From f6955b7a6e9069336abe133bc7aa35151324909c Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Fri, 15 Mar 2024 10:47:34 -0400 Subject: [PATCH 001/272] DOC v24.06 Updates [skip ci] --- .../cuda11.8-conda/devcontainer.json | 4 +- .devcontainer/cuda11.8-pip/devcontainer.json | 4 +- .../cuda12.2-conda/devcontainer.json | 4 +- .devcontainer/cuda12.2-pip/devcontainer.json | 4 +- .github/workflows/build.yaml | 16 ++++---- .github/workflows/pr.yaml | 38 +++++++++---------- .github/workflows/test.yaml | 22 +++++------ README.md | 2 +- VERSION | 2 +- .../all_cuda-118_arch-x86_64.yaml | 10 ++--- .../all_cuda-122_arch-x86_64.yaml | 10 ++--- cpp/examples/fetch_dependencies.cmake | 2 +- dependencies.yaml | 32 ++++++++-------- java/ci/README.md | 4 +- java/pom.xml | 2 +- python/cudf/pyproject.toml | 4 +- python/cudf_kafka/pyproject.toml | 4 +- python/custreamz/pyproject.toml | 6 +-- python/dask_cudf/pyproject.toml | 6 +-- 19 files changed, 88 insertions(+), 88 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 6e71505fc7e..9999eebdc97 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 15b51da8dea..90471e0b750 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ 
b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json index 31ae8426763..5a61d26e1f5 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.2-conda/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "12.2", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.04-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json index 93367527a86..29817cdadc3 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "12.2", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.04-cpp-cuda12.2-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, 
"overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ef2141ed934..1e27f590908 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} build-2_28-wheels: "true" @@ -80,7 +80,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: 
inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -90,7 +90,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -102,7 +102,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index e4aed2b2ef8..986acd104cc 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -32,41 +32,41 @@ jobs: #- pandas-tests-diff #- pandas-tests-diff-comment secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 with: 
build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -74,14 +74,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -91,7 +91,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -101,7 +101,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -111,7 +111,7 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: build_type: pull-request build-2_28-wheels: "true" @@ -119,14 +119,14 @@ jobs: wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -135,7 +135,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -143,7 +143,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@fix/devcontainer-json-location + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.06 with: arch: '["amd64"]' cuda: '["12.2"]' @@ -154,7 +154,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request @@ -163,7 +163,7 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request @@ -175,7 +175,7 @@ jobs: # needs: [pandas-tests-main, pandas-tests-pr] # secrets: inherit # # This branch exports a `job_output` output that the downstream job reads. 
- # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 # with: # node_type: cpu4 # build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4cb6baf2d63..1f27ffcffe3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: 
build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -76,7 +76,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +97,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -108,7 +108,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -118,7 +118,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: nightly diff --git a/README.md b/README.md index 8f9e57ff3ad..205e16ea0e5 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=24.04 python=3.11 cuda-version=12.2 + cudf=24.06 python=3.11 cuda-version=12.2 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index 4a2fe8aa570..0bff6981a3d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.04.00 +24.06.00 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cf363a819a2..82d7104b0da 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -27,7 +27,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.4.* +- dask-cuda==24.6.* - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -46,10 
+46,10 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==24.4.* +- libkvikio==24.6.* - libparquet==14.0.2.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.4.* +- librmm==24.6.* - make - moto>=4.0.8 - msgpack-python @@ -80,9 +80,9 @@ dependencies: - python-confluent-kafka>=1.9.0,<1.10.0a0 - python>=3.9,<3.12 - pytorch>=2.1.0 -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.6.* - rich -- rmm==24.4.* +- rmm==24.6.* - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 42460532b1b..0fd87e91745 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -28,7 +28,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.4.* +- dask-cuda==24.6.* - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -45,10 +45,10 @@ dependencies: - libarrow==14.0.2.* - libcufile-dev - libcurand-dev -- libkvikio==24.4.* +- libkvikio==24.6.* - libparquet==14.0.2.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.4.* +- librmm==24.6.* - make - moto>=4.0.8 - msgpack-python @@ -78,9 +78,9 @@ dependencies: - python-confluent-kafka>=1.9.0,<1.10.0a0 - python>=3.9,<3.12 - pytorch>=2.1.0 -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.6.* - rich -- rmm==24.4.* +- rmm==24.6.* - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy diff --git a/cpp/examples/fetch_dependencies.cmake b/cpp/examples/fetch_dependencies.cmake index a03f84ae142..e4c11bbdeca 100644 --- a/cpp/examples/fetch_dependencies.cmake +++ b/cpp/examples/fetch_dependencies.cmake @@ -19,7 +19,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-24.04) +set(CUDF_TAG branch-24.06) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/dependencies.yaml b/dependencies.yaml index 
db0a766df82..85f5a86d938 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -238,8 +238,8 @@ dependencies: - &gbench benchmark==1.8.0 - >est gtest>=1.13.0 - &gmock gmock>=1.13.0 - - librmm==24.4.* - - libkvikio==24.4.* + - librmm==24.6.* + - libkvikio==24.6.* - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - nvcomp==3.0.6 @@ -275,7 +275,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_conda rmm==24.4.* + - &rmm_conda rmm==24.6.* - &protobuf protobuf>=4.21,<5 - pip - pip: @@ -295,10 +295,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: &build_python_packages_cu12 - - &rmm_cu12 rmm-cu12==24.4.* + - &rmm_cu12 rmm-cu12==24.6.* - matrix: {cuda: "11.*"} packages: &build_python_packages_cu11 - - &rmm_cu11 rmm-cu11==24.4.* + - &rmm_cu11 rmm-cu11==24.6.* - {matrix: null, packages: null } - output_types: pyproject matrices: @@ -456,7 +456,7 @@ dependencies: - output_types: [conda] packages: - breathe>=4.35.0 - - dask-cuda==24.4.* + - dask-cuda==24.6.* - *doxygen - make - myst-nb @@ -548,11 +548,11 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - rmm-cu12==24.4.* + - rmm-cu12==24.6.* - pynvjitlink-cu12 - matrix: {cuda: "11.*"} packages: - - rmm-cu11==24.4.* + - rmm-cu11==24.6.* - cubinlinker-cu11 - ptxcompiler-cu11 - {matrix: null, packages: null} @@ -563,7 +563,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==24.4.* + - rapids-dask-dependency==24.6.* run_custreamz: common: - output_types: conda @@ -652,13 +652,13 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==24.4.* + - dask-cuda==24.6.* - *numba depends_on_cudf: common: - output_types: conda packages: - - &cudf_conda cudf==24.4.* + - &cudf_conda cudf==24.6.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -670,16 +670,16 @@ dependencies: matrices: - matrix: 
{cuda: "12.*"} packages: - - cudf-cu12==24.4.* + - cudf-cu12==24.6.* - matrix: {cuda: "11.*"} packages: - - cudf-cu11==24.4.* + - cudf-cu11==24.6.* - {matrix: null, packages: [*cudf_conda]} depends_on_cudf_kafka: common: - output_types: conda packages: - - &cudf_kafka_conda cudf_kafka==24.4.* + - &cudf_kafka_conda cudf_kafka==24.6.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -691,10 +691,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cudf_kafka-cu12==24.4.* + - cudf_kafka-cu12==24.6.* - matrix: {cuda: "11.*"} packages: - - cudf_kafka-cu11==24.4.* + - cudf_kafka-cu11==24.6.* - {matrix: null, packages: [*cudf_kafka_conda]} depends_on_cupy: common: diff --git a/java/ci/README.md b/java/ci/README.md index ba039acc45d..da24c5923ea 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-centos7 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.04 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.06 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable devtoolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-24.04.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-24.06.0-SNAPSHOT-cuda11.jar. 
diff --git a/java/pom.xml b/java/pom.xml index 8b2fdcaa85f..46b5ce4c083 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 24.04.0-SNAPSHOT + 24.06.0-SNAPSHOT cudfjni diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index da574fdb031..003a92988de 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -9,7 +9,7 @@ requires = [ "numpy==1.23.*", "protoc-wheel", "pyarrow==14.0.2.*", - "rmm==24.4.*", + "rmm==24.6.*", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -38,7 +38,7 @@ dependencies = [ "ptxcompiler", "pyarrow>=14.0.1,<15.0.0a0", "rich", - "rmm==24.4.*", + "rmm==24.6.*", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 7369b99aaf4..eb48852202a 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. [build-system] build-backend = "scikit_build_core.build" @@ -22,7 +22,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.4.*", + "cudf==24.6.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index ccaa2543cc3..e6c86351ac9 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
+# Copyright (c) 2021-2024, NVIDIA CORPORATION. [build-system] build-backend = "setuptools.build_meta" @@ -19,8 +19,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "confluent-kafka>=1.9.0,<1.10.0a0", - "cudf==24.4.*", - "cudf_kafka==24.4.*", + "cudf==24.6.*", + "cudf_kafka==24.6.*", "streamz", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index b55bb9d3eaf..d0743516c4d 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -18,12 +18,12 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.4.*", + "cudf==24.6.*", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.23,<2.0a0", "pandas>=2.0,<2.2.2dev0", - "rapids-dask-dependency==24.4.*", + "rapids-dask-dependency==24.6.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -44,7 +44,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" [project.optional-dependencies] test = [ - "dask-cuda==24.4.*", + "dask-cuda==24.6.*", "numba>=0.57", "pytest-cov", "pytest-xdist", From f0919494ad874dd23cb63630272165c41f9ea144 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 18 Mar 2024 12:59:30 -0500 Subject: [PATCH 002/272] Allow ``numeric_only=True`` for simple groupby reductions (#15326) Adds some simple logic to handle the case that `DataFrameGroupBy._reduce(numeric_only=True)` is called. ## Further Background This change is needed for some dask_cudf groupby aggregations (e.g. "mean") to work with the latest `dask/dask-expr:main`. 
Although other workarounds and "fixes" are possible, the easiest solution is probably something like this PR. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15326 --- python/cudf/cudf/core/groupby/groupby.py | 29 +++++++++++++++++++---- python/cudf/cudf/tests/test_groupby.py | 30 ++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d995964057b..945e546af1a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -22,7 +22,12 @@ from cudf._lib.types import size_type_dtype from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.extensions import no_default -from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like +from cudf.api.types import ( + is_bool_dtype, + is_float_dtype, + is_list_like, + is_numeric_dtype, +) from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -701,6 +706,11 @@ def agg(self, func): return result + def _reduce_numeric_only(self, op: str): + raise NotImplementedError( + f"numeric_only is not implemented for {type(self)}" + ) + def _reduce( self, op: str, @@ -731,14 +741,12 @@ def _reduce( The numeric_only, min_count """ - if numeric_only: - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) if min_count != 0: raise NotImplementedError( "min_count parameter is not implemented yet" ) + if numeric_only: + return self._reduce_numeric_only(op) return self.agg(op) def _scan(self, op: str, *args, **kwargs): @@ -2648,6 +2656,17 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): _PROTECTED_KEYS = frozenset(("obj",)) + def _reduce_numeric_only(self, op: str): + 
columns = list( + name + for name in self.obj._data.names + if ( + is_numeric_dtype(self.obj._data[name].dtype) + and name not in self.grouping.names + ) + ) + return self[columns].agg(op) + def __getitem__(self, key): return self.obj[key].groupby( by=self.grouping.keys, diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 06516b6b4ea..c139b06d20f 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1259,7 +1259,7 @@ def test_groupby_unsupported_columns(): pdg = pdf.groupby("x").sum(numeric_only=True) # cudf does not yet support numeric_only, so our default is False (unlike # pandas, which defaults to inferring and throws a warning about it). - gdg = gdf.groupby("x").sum() + gdg = gdf.groupby("x").sum(numeric_only=True) assert_groupby_results_equal(pdg, gdg) @@ -2158,7 +2158,9 @@ def test_groupby_list_columns_excluded(): pandas_agg_result = pdf.groupby("a").agg("mean", numeric_only=True) assert_groupby_results_equal( - pandas_result, gdf.groupby("a").mean(), check_dtype=False + pandas_result, + gdf.groupby("a").mean(numeric_only=True), + check_dtype=False, ) assert_groupby_results_equal( @@ -3826,3 +3828,27 @@ def test_groupby_shift_series_multiindex(): result = ser.groupby(level=0).shift(1) expected = ser.to_pandas().groupby(level=0).shift(1) assert_eq(expected, result) + + +@pytest.mark.parametrize( + "func", ["min", "max", "sum", "mean", "idxmin", "idxmax"] +) +@pytest.mark.parametrize( + "by,data", + [ + ("a", {"a": [1, 2, 3]}), + (["a", "id"], {"id": [0, 0, 1], "a": [1, 2, 3]}), + ("a", {"a": [1, 2, 3], "b": ["A", "B", "C"]}), + ("id", {"id": [0, 0, 1], "a": [1, 2, 3], "b": ["A", "B", "C"]}), + (["b", "id"], {"id": [0, 0, 1], "b": ["A", "B", "C"]}), + ("b", {"b": ["A", "B", "C"]}), + ], +) +def test_group_by_reduce_numeric_only(by, data, func): + # Test that simple groupby reductions support numeric_only=True + df = cudf.DataFrame(data) + expected = 
getattr(df.to_pandas().groupby(by, sort=True), func)( + numeric_only=True + ) + result = getattr(df.groupby(by, sort=True), func)(numeric_only=True) + assert_eq(expected, result) From ae60f1dd4acd9e786ccd9165b0ba7d5f8286b914 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 19 Mar 2024 08:36:00 -0500 Subject: [PATCH 003/272] Avoid duplicate dask-cudf testing (#15333) Sets `DASK_DATAFRAME__QUERY_PLANNING` explicitly in tests to avoid duplicate testing of dask-expr once dask version is unpinned. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/15333 --- ci/test_python_other.sh | 11 +++++------ ci/test_wheel_dask_cudf.sh | 12 ++++++------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index 8ecd02f70a1..cbc1dc1cb87 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -19,8 +19,8 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e -rapids-logger "pytest dask_cudf" -./ci/run_dask_cudf_pytests.sh \ +rapids-logger "pytest dask_cudf (dask-expr)" +DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ @@ -29,10 +29,9 @@ rapids-logger "pytest dask_cudf" --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \ --cov-report=term -# Run tests in dask_cudf/tests and dask_cudf/io/tests with dask-expr -rapids-logger "pytest dask_cudf + dask_expr" -DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-expr.xml" \ +rapids-logger "pytest dask_cudf (legacy)" +DASK_DATAFRAME__QUERY_PLANNING=False 
./ci/run_dask_cudf_pytests.sh \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ --numprocesses=8 \ --dist=loadscope \ . diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 398eed43ea4..d7fb60e5075 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -31,19 +31,19 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" # Run tests in dask_cudf/tests and dask_cudf/io/tests -rapids-logger "pytest dask_cudf" +rapids-logger "pytest dask_cudf (dask-expr)" pushd python/dask_cudf/dask_cudf -python -m pytest \ +DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ . popd -# Run tests in dask_cudf/tests and dask_cudf/io/tests with dask-expr -rapids-logger "pytest dask_cudf + dask_expr" +# Run tests in dask_cudf/tests and dask_cudf/io/tests (legacy) +rapids-logger "pytest dask_cudf (legacy)" pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-expr.xml" \ +DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ --numprocesses=8 \ . popd From 819e819e5c0ad9b2f84d8e3ce94982a6f2b1f373 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 19 Mar 2024 20:15:43 -0500 Subject: [PATCH 004/272] Disable dask-expr in docs builds. (#15343) Fixes CI blocked by dask-expr. 
xref: - https://github.com/rapidsai/cudf/pull/14805 - https://github.com/rapidsai/rapids-dask-dependency/pull/33 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ray Douglass (https://github.com/raydouglass) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15343 --- ci/build_docs.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 8e22f02b484..fc02fe7548c 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -41,6 +41,9 @@ mkdir -p "${RAPIDS_DOCS_DIR}/libcudf/html" mv html/* "${RAPIDS_DOCS_DIR}/libcudf/html" popd +# TODO: Remove this once dask-expr works in the 10min notebook +export DASK_DATAFRAME__QUERY_PLANNING=False + rapids-logger "Build Python docs" pushd docs/cudf make dirhtml From ae16ecbb8ad278498d51697d5bad211f5e7f4325 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 19 Mar 2024 22:14:00 -0500 Subject: [PATCH 005/272] Drop CentOS 7 support. (#15323) This PR tests https://github.com/rapidsai/shared-workflows/pull/192, which drops CentOS 7 support. See https://github.com/rapidsai/build-planning/issues/23. This PR removes the logic needed to build and test both `manylinux_2_17` and `manylinux_2_28` wheels, as we will only be building for `manylinux_2_28`. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Jake Awe (https://github.com/AyodeAwe) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15323 --- .github/workflows/build.yaml | 1 - .github/workflows/pr.yaml | 1 - ci/build_wheel_cudf.sh | 2 +- ci/cudf_pandas_scripts/pandas-tests/run.sh | 15 +---------- ci/cudf_pandas_scripts/run_tests.sh | 17 ++---------- ci/test_wheel_cudf.sh | 15 +---------- ci/test_wheel_dask_cudf.sh | 15 +---------- cpp/cmake/thirdparty/get_arrow.cmake | 31 ---------------------- 8 files changed, 6 insertions(+), 91 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 1e27f590908..67c451fbd6e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -72,7 +72,6 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} - build-2_28-wheels: "true" branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 224e5221a5b..303988212d3 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -114,7 +114,6 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: build_type: pull-request - build-2_28-wheels: "true" script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index cde22bb70d1..f0886a28fd9 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -13,4 +13,4 @@ python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_${AUDITWHEEL_POLICY}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist 
+RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index f3c37ecde26..1f70ca78c41 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -10,21 +10,8 @@ PANDAS_TESTS_BRANCH=${1} rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch" rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}" -# Set the manylinux version used for downloading the wheels so that we test the -# newer ABI wheels on the newer images that support their installation. -# Need to disable pipefail for the head not to fail, see -# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q -set +o pipefail -glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2) -set -o pipefail -manylinux_version="2_17" -if [[ ${glibc_minor_version} -ge 28 ]]; then - manylinux_version="2_28" -fi -manylinux="manylinux_${manylinux_version}" - RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,pandas-tests] RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 4f1e4bbf993..78945d37f22 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -1,5 +1,5 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -31,21 +31,8 @@ done if [ "$no_cudf" = true ]; then echo "Skipping cudf install" else - # Set the manylinux version used for downloading the wheels so that we test the - # newer ABI wheels on the newer images that support their installation. - # Need to disable pipefail for the head not to fail, see - # https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q - set +o pipefail - glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2) - set -o pipefail - manylinux_version="2_17" - if [[ ${glibc_minor_version} -ge 28 ]]; then - manylinux_version="2_28" - fi - manylinux="manylinux_${manylinux_version}" - RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep + RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf-pandas-tests] fi diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index af5779f478a..83f0b976128 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -3,21 +3,8 @@ set -eou pipefail -# Set the manylinux version used for downloading the wheels so that we test the -# newer ABI wheels on the newer images that support their installation. -# Need to disable pipefail for the head not to fail, see -# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q -set +o pipefail -glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' 
-f2) -set -o pipefail -manylinux_version="2_17" -if [[ ${glibc_minor_version} -ge 28 ]]; then - manylinux_version="2_28" -fi -manylinux="manylinux_${manylinux_version}" - RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/cudf*.whl)[test] diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index d7fb60e5075..2b20b9d9ce4 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -7,20 +7,7 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist # Download the cudf built in the previous step -# Set the manylinux version used for downloading the wheels so that we test the -# newer ABI wheels on the newer images that support their installation. -# Need to disable pipefail for the head not to fail, see -# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q -set +o pipefail -glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' 
-f2) -set -o pipefail -manylinux_version="2_17" -if [[ ${glibc_minor_version} -ge 28 ]]; then - manylinux_version="2_28" -fi -manylinux="manylinux_${manylinux_version}" - -RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep python -m pip install --no-deps ./local-cudf-dep/cudf*.whl # echo to expand wildcard before adding `[extra]` requires for pip diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 114a1f98a68..892056959c8 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -68,37 +68,6 @@ list(POP_BACK CMAKE_PREFIX_PATH) find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) add_library(arrow_shared ALIAS Arrow::Arrow) - # When using the libarrow inside a wheel, whether or not libcudf may be built using the new C++11 - # ABI is dependent on whether the libarrow inside the wheel was compiled using that ABI because we - # need the arrow library that we bundle in cudf to be ABI-compatible with the one inside pyarrow. - # We determine what options to use by checking the glibc version on the current system, which is - # also how pip determines which manylinux-versioned pyarrow wheel to install. Note that tests will - # not build successfully without also propagating these options to builds of GTest. Similarly, - # benchmarks will not work without updating GBench (and possibly NVBench) builds. We are currently - # ignoring these limitations since we don't anticipate using this feature except for building - # wheels. 
- enable_language(C) - execute_process( - COMMAND ${CMAKE_C_COMPILER} -print-file-name=libc.so.6 - OUTPUT_VARIABLE GLIBC_EXECUTABLE - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - execute_process( - COMMAND ${GLIBC_EXECUTABLE} - OUTPUT_VARIABLE GLIBC_OUTPUT - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - string(REGEX MATCH "stable release version ([0-9]+\\.[0-9]+)" GLIBC_VERSION ${GLIBC_OUTPUT}) - string(REPLACE "stable release version " "" GLIBC_VERSION ${GLIBC_VERSION}) - string(REPLACE "." ";" GLIBC_VERSION_LIST ${GLIBC_VERSION}) - list(GET GLIBC_VERSION_LIST 1 GLIBC_VERSION_MINOR) - if(GLIBC_VERSION_MINOR LESS 28) - target_compile_options( - Arrow::Arrow INTERFACE "$<$:-D_GLIBCXX_USE_CXX11_ABI=0>" - "$<$:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>" - ) - endif() - rapids_export_package(BUILD Arrow cudf-exports) rapids_export_package(INSTALL Arrow cudf-exports) endfunction() From 4456428784d8bf5be343b6f2b3527013a054ff99 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 21 Mar 2024 08:42:50 -0400 Subject: [PATCH 006/272] Add debug tips section to libcudf developer guide (#15329) Adds a debugging tips section to the developer guide. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15329 --- CONTRIBUTING.md | 2 +- .../developer_guide/DEVELOPER_GUIDE.md | 22 +++++++++++++++++++ cpp/doxygen/developer_guide/TESTING.md | 15 ++++++++++--- 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e7f7a20e307..dce92d7e613 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -217,7 +217,7 @@ cuda-gdb -ex r --args python .py ``` ```bash -cuda-memcheck python .py +compute-sanitizer --tool memcheck python .py ``` ### Device debug symbols diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 8188c466312..ce9840050a9 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1384,3 +1384,25 @@ cuIO is a component of libcudf that provides GPU-accelerated reading and writing formats commonly used in data analytics, including CSV, Parquet, ORC, Avro, and JSON_Lines. // TODO: add more detail and move to a separate file. + +# Debugging Tips + +Here are some tools that can help with debugging libcudf (besides printf of course): +1. `cuda-gdb`\ + Follow the instructions in the [Contributor to cuDF guide](../../../CONTRIBUTING.md#debugging-cudf) to build + and run libcudf with debug symbols. +2. `compute-sanitizer`\ + The [CUDA Compute Sanitizer](https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html) + tool can be used to locate many CUDA reported errors by providing a call stack + close to where the error occurs even with a non-debug build. The sanitizer includes various + tools including `memcheck`, `racecheck`, and `initcheck` as well as others. 
+ The `racecheck` and `initcheck` have been known to produce false positives. +3. `cudf::test::print()`\ + The `print()` utility can be called within a gtest to output the data in a `cudf::column_view`. + More information is available in the [Testing Guide](TESTING.md#printing-and-accessing-column-data) +4. GCC Address Sanitizer\ + The GCC ASAN can also be used by adding the `-fsanitize=address` compiler flag. + There is a compatibility issue with the CUDA runtime that can be worked around by setting + environment variable `ASAN_OPTIONS=protect_shadow_gap=0` before running the executable. + Note that the CUDA `compute-sanitizer` can also be used with GCC ASAN by setting the + environment variable `ASAN_OPTIONS=protect_shadow_gap=0,alloc_dealloc_mismatch=0`. diff --git a/cpp/doxygen/developer_guide/TESTING.md b/cpp/doxygen/developer_guide/TESTING.md index a4ffe0f575b..9c86be5a55d 100644 --- a/cpp/doxygen/developer_guide/TESTING.md +++ b/cpp/doxygen/developer_guide/TESTING.md @@ -455,10 +455,19 @@ Column comparison functions in the `cudf::test::detail` namespace should **NOT** ### Printing and accessing column data -`include/cudf_test/column_utilities.hpp` defines various functions and overloads for printing +The `` header defines various functions and overloads for printing columns (`print`), converting column data to string (`to_string`, `to_strings`), and copying data to -the host (`to_host`). - +the host (`to_host`). For example, to print a `cudf::column_view` contents or `column_wrapper` instance +to the console use the `cudf::test::print()`: +```cpp + cudf::test::fixed_width_column_wrapper input({1,2,3,4}); + auto splits = cudf::split(input,{2}); + cudf::test::print(input); + cudf::test::print(splits.front()); +``` +Fixed-width and strings columns output as comma-separated entries including null rows. +Nested columns are also supported and output includes the offsets and data children as well as +the null mask bits. 
## Validating Stream Usage From ebd2ce7c08423ff2c16a1729fecb11fb1908562b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 21 Mar 2024 08:48:42 -1000 Subject: [PATCH 007/272] Use ruff pydocstyle over pydocstyle pre-commit hook (#15345) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most rules were able to translate over except `D302` (`Use uā€ā€œā€ for Unicode docstrings`), which is probably not needed anymore Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15345 --- .pre-commit-config.yaml | 12 ------------ .../developer_guide/contributing_guide.md | 2 -- .../source/developer_guide/documentation.md | 2 +- pyproject.toml | 18 +++--------------- python/cudf/cudf/core/column/column.py | 2 ++ python/cudf/cudf/core/dataframe.py | 3 +-- python/cudf/cudf/core/reshape.py | 4 +++- python/cudf/cudf/utils/ioutils.py | 1 + python/dask_cudf/dask_cudf/accessors.py | 5 ++++- 9 files changed, 15 insertions(+), 34 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ce5d4f93444..06fdcb9f761 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,18 +38,6 @@ repos: "python/cudf_kafka/cudf_kafka", "python/dask_cudf/dask_cudf"] pass_filenames: false - - repo: https://github.com/PyCQA/pydocstyle - rev: 6.3.0 - hooks: - - id: pydocstyle - # https://github.com/PyCQA/pydocstyle/issues/603 - additional_dependencies: [tomli] - args: ["--config=pyproject.toml"] - exclude: | - (?x)^( - ^python/cudf/cudf/pandas/scripts/.*| - ^python/cudf/cudf_pandas_tests/.* - ) - repo: https://github.com/nbQA-dev/nbQA rev: 1.7.1 hooks: diff --git 
a/docs/cudf/source/developer_guide/contributing_guide.md b/docs/cudf/source/developer_guide/contributing_guide.md index 65b0e4e3f41..6fce268f309 100644 --- a/docs/cudf/source/developer_guide/contributing_guide.md +++ b/docs/cudf/source/developer_guide/contributing_guide.md @@ -16,12 +16,10 @@ The `.pre-commit-config.yaml` file at the root of the repo is the primary source Specifically, cuDF uses the following tools: - [`ruff`](https://beta.ruff.rs/) checks for general code formatting compliance. -- [`black`](https://github.com/psf/black) is an automatic code formatter. - [`isort`](https://pycqa.github.io/isort/) ensures imports are sorted consistently. - [`mypy`](http://mypy-lang.org/) performs static type checking. In conjunction with [type hints](https://docs.python.org/3/library/typing.html), `mypy` can help catch various bugs that are otherwise difficult to find. -- [`pydocstyle`](https://github.com/PyCQA/pydocstyle/) lints docstring style. - [`codespell`](https://github.com/codespell-project/codespell) finds spelling errors. Linter config data is stored in a number of files. diff --git a/docs/cudf/source/developer_guide/documentation.md b/docs/cudf/source/developer_guide/documentation.md index 26557de917a..c8da689479c 100644 --- a/docs/cudf/source/developer_guide/documentation.md +++ b/docs/cudf/source/developer_guide/documentation.md @@ -72,7 +72,7 @@ Our guidelines include one addition to the standard the `numpydoc` guide. Class properties, which are not explicitly covered, should be documented in the getter function. That choice makes `help` more useful as well as enabling docstring inheritance in subclasses. -All of our docstrings are validated using [`pydocstyle`](http://www.pydocstyle.org/en/stable/). +All of our docstrings are validated using [`ruff pydocstyle rules`](https://docs.astral.sh/ruff/rules/#pydocstyle-d). This ensures that docstring style is consistent and conformant across the codebase. 
## Published documentation diff --git a/pyproject.toml b/pyproject.toml index c71394058df..28eac66c1d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,19 +1,5 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. -[tool.pydocstyle] -# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather -# than include using match-dir. Note that as discussed in -# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle, -# unlike the match option above this match-dir will have no effect when -# pydocstyle is invoked from pre-commit. Therefore this exclusion list must -# also be maintained in the pre-commit config file. -match-dir = "^(?!(ci|cpp|conda|docs|java|notebooks|python/cudf/cudf/pandas/scripts|python/cudf/cudf_pandas_tests)).*$" -# Allow missing docstrings for docutils -ignore-decorators = ".*(docutils|doc_apply|copy_docstring).*" -select = "D201, D204, D206, D207, D208, D209, D210, D211, D214, D215, D300, D301, D302, D403, D405, D406, D407, D408, D409, D410, D411, D412, D414, D418" - # Would like to enable the following rules in the future: - # D200, D202, D205, D400 - [tool.mypy] ignore_missing_imports = true # If we don't specify this, then mypy will check excluded files if @@ -38,7 +24,7 @@ builtin = "clear" quiet-level = 3 [tool.ruff] -select = ["E", "F", "W"] +select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418"] ignore = [ # whitespace before : "E203", @@ -55,3 +41,5 @@ line-length = 79 [tool.ruff.per-file-ignores] # Lots of pytest implicitly injected attributes in conftest-patch.py "python/cudf/cudf/pandas/scripts/conftest-patch.py" = ["F821"] +"python/cudf/cudf/pandas/scripts/*" = ["D"] +"python/cudf/cudf_pandas_tests/*" = ["D"] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py 
index f13d8cf12f7..2541e076250 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2382,10 +2382,12 @@ def serialize_columns(columns) -> Tuple[List[dict], List]: """ Return the headers and frames resulting from serializing a list of Column + Parameters ---------- columns : list list of Columns to serialize + Returns ------- headers : list diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 35588725655..da0a969b70c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4798,7 +4798,6 @@ def apply_chunks( Examples -------- - For ``tpb > 1``, ``func`` is executed by ``tpb`` number of threads concurrently. To access the thread id and count, use ``numba.cuda.threadIdx.x`` and ``numba.cuda.blockDim.x``, @@ -4824,7 +4823,7 @@ def apply_chunks( ... z = in3[i] ... out1[i] = x * y + z - See also + See Also -------- DataFrame.apply_rows """ diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 2ef39e9357d..9008d2f3a1b 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -23,7 +23,8 @@ def _align_objs(objs, how="outer", sort=None): - """Align a set of Series or Dataframe objects. + """ + Align a set of Series or Dataframe objects. Parameters ---------- @@ -31,6 +32,7 @@ def _align_objs(objs, how="outer", sort=None): how : How to handle indexes on other axis (or axes), similar to join in concat sort : Whether to sort the resulting Index + Returns ------- A list of reindexed and aligned objects diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 85abf438efb..0a0ee4f592c 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1844,6 +1844,7 @@ def stringify_pathlike(pathlike): """ Convert any object that implements the fspath protocol to a string. 
Leaves other objects unchanged + Parameters ---------- pathlike diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 1c21fca51c8..47b22696415 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. class StructMethods: @@ -9,14 +9,17 @@ def field(self, key): """ Extract children of the specified struct column in the Series + Parameters ---------- key: int or str index/position or field name of the respective struct column + Returns ------- Series + Examples -------- >>> s = cudf.Series([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]) From 80a02c6f9a6ca6a6bfc20a25553426026e0d4be4 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 21 Mar 2024 18:54:15 +0000 Subject: [PATCH 008/272] Remove boundscheck=False setting in cython files (#15362) Since the performance in these files is not critical, we don't need to elide bounds checking in (for example) list accesses. - Closes #15360 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - https://github.com/brandon-b-miller - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15362 --- python/cudf/cudf/_lib/json.pyx | 2 -- python/cudf/cudf/_lib/parquet.pyx | 2 -- 2 files changed, 4 deletions(-) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 9bbad0f61c3..f2e03391f08 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -1,7 +1,5 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. 
-# cython: boundscheck = False - import io import os from collections import abc diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d3f5b423373..ce1cba59bec 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -1,7 +1,5 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. -# cython: boundscheck = False - import io import pyarrow as pa From b29fc1df66306298e2324f0a23a5ebf20c543216 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 22 Mar 2024 09:54:59 -0400 Subject: [PATCH 009/272] Rework cudf::find_and_replace_all to use gather-based make_strings_column (#15305) Reworks `cudf::find_and_replace_all` for strings to work with long strings and enable it to support large strings. The custom kernels were replaced with a gather-based `make_strings_column` already optimized for long and short strings. Large strings will automatically be supported in `make_strings_column` in a future PR. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Paul Mattione (https://github.com/pmattione-nvidia) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15305 --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/strings/detail/replace.hpp | 18 ++ cpp/src/replace/replace.cu | 212 +------------------- cpp/src/strings/replace/find_replace.cu | 87 ++++++++ cpp/tests/replace/replace_tests.cpp | 8 +- 5 files changed, 111 insertions(+), 215 deletions(-) create mode 100644 cpp/src/strings/replace/find_replace.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 12837c69e59..618d03f7078 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -590,6 +590,7 @@ add_library( src/strings/regex/regex_program.cpp src/strings/repeat_strings.cu src/strings/replace/backref_re.cu + src/strings/replace/find_replace.cu src/strings/replace/multi.cu src/strings/replace/multi_re.cu src/strings/replace/replace.cu diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp index 28027291b28..0f050f057fa 100644 --- a/cpp/include/cudf/strings/detail/replace.hpp +++ b/cpp/include/cudf/strings/detail/replace.hpp @@ -81,6 +81,24 @@ std::unique_ptr replace_slice(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Return a copy of `input` replacing any `values_to_replace[i]` + * found with `replacement_values[i]` + * + * @param input The column to find and replace values + * @param values_to_replace The values to find + * @param replacement_values The corresponding replacement values + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Copy of `input` with 
specified values replaced + */ +std::unique_ptr find_and_replace_all( + cudf::strings_column_view const& input, + cudf::strings_column_view const& values_to_replace, + cudf::strings_column_view const& replacement_values, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 88d5d3a2375..91a0ced791a 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include #include @@ -57,7 +57,6 @@ #include #include #include -#include namespace { // anonymous @@ -87,140 +86,6 @@ __device__ auto get_new_value(cudf::size_type idx, return thrust::make_pair(new_value, output_is_valid); } -__device__ int get_new_string_value(cudf::size_type idx, - cudf::column_device_view& input, - cudf::column_device_view& values_to_replace, - cudf::column_device_view&) -{ - cudf::string_view input_string = input.element(idx); - int match = -1; - for (int i = 0; i < values_to_replace.size(); i++) { - cudf::string_view value_string = values_to_replace.element(i); - if (input_string == value_string) { - match = i; - break; - } - } - return match; -} - -/** - * @brief Kernel which does the first pass of strings replace. - * - * It computes the output null_mask, null_count, and the offsets. - * - * @param input The input column to replace strings in. - * @param values_to_replace The string values to replace. - * @param replacement The replacement values. 
- * @param offsets The column which will contain the offsets of the new string column - * @param indices Temporary column used to store the replacement indices - * @param output_valid The output null_mask - * @param output_valid_count The output valid count - */ -template -CUDF_KERNEL void replace_strings_first_pass(cudf::column_device_view input, - cudf::column_device_view values_to_replace, - cudf::column_device_view replacement, - cudf::mutable_column_device_view offsets, - cudf::mutable_column_device_view indices, - cudf::bitmask_type* output_valid, - cudf::size_type* __restrict__ output_valid_count) -{ - cudf::size_type nrows = input.size(); - auto tid = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - uint32_t active_mask = 0xffff'ffffu; - active_mask = __ballot_sync(active_mask, tid < nrows); - auto const lane_id{threadIdx.x % cudf::detail::warp_size}; - uint32_t valid_sum{0}; - - while (tid < nrows) { - auto const idx = static_cast(tid); - bool input_is_valid = true; - - if (input_has_nulls) input_is_valid = input.is_valid_nocheck(idx); - bool output_is_valid = input_is_valid; - - if (input_is_valid) { - int result = get_new_string_value(idx, input, values_to_replace, replacement); - cudf::string_view output = (result == -1) ? 
input.element(idx) - : replacement.element(result); - offsets.data()[idx] = output.size_bytes(); - indices.data()[idx] = result; - if (replacement_has_nulls && result != -1) { - output_is_valid = replacement.is_valid_nocheck(result); - } - } else { - offsets.data()[idx] = 0; - indices.data()[idx] = -1; - } - - uint32_t bitmask = __ballot_sync(active_mask, output_is_valid); - if (0 == lane_id) { - output_valid[cudf::word_index(idx)] = bitmask; - valid_sum += __popc(bitmask); - } - - tid += stride; - active_mask = __ballot_sync(active_mask, tid < nrows); - } - - // Compute total valid count for this block and add it to global count - uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); - // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { - atomicAdd(output_valid_count, static_cast(block_valid_count)); - } -} - -/** - * @brief Kernel which does the second pass of strings replace. - * - * It copies the string data needed from input and replacement into the new strings column chars - * column. - * - * @param input The input column - * @param replacement The replacement values - * @param offsets The offsets column of the new strings column - * @param strings The chars column of the new strings column - * @param indices Temporary column used to store the replacement indices. 
- */ -template -CUDF_KERNEL void replace_strings_second_pass(cudf::column_device_view input, - cudf::column_device_view replacement, - cudf::mutable_column_device_view offsets, - char* strings, - cudf::mutable_column_device_view indices) -{ - cudf::size_type nrows = input.size(); - auto tid = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - - while (tid < nrows) { - auto const idx = static_cast(tid); - auto const replace_idx = indices.element(idx); - bool output_is_valid = true; - bool input_is_valid = true; - - if (input_has_nulls) { - input_is_valid = input.is_valid_nocheck(idx); - output_is_valid = input_is_valid; - } - if (replacement_has_nulls && replace_idx != -1) { - output_is_valid = replacement.is_valid_nocheck(replace_idx); - } - if (output_is_valid) { - cudf::string_view output = (replace_idx == -1) - ? input.element(idx) - : replacement.element(replace_idx); - std::memcpy( - strings + offsets.data()[idx], output.data(), output.size_bytes()); - } - - tid += stride; - } -} - /** * @brief Kernel that replaces elements from `output_data` given the following * rule: replace all `values_to_replace[i]` in [values_to_replace_begin`, @@ -375,79 +240,8 @@ std::unique_ptr replace_kernel_forwarder::operator() valid_counter(0, stream); - cudf::size_type* valid_count = valid_counter.data(); - - auto replace_first = replace_strings_first_pass; - auto replace_second = replace_strings_second_pass; - if (input_col.has_nulls()) { - if (replacement_values.has_nulls()) { - replace_first = replace_strings_first_pass; - replace_second = replace_strings_second_pass; - } - } else { - if (replacement_values.has_nulls()) { - replace_first = replace_strings_first_pass; - replace_second = replace_strings_second_pass; - } else { - replace_first = replace_strings_first_pass; - replace_second = replace_strings_second_pass; - } - } - - // Create new offsets column to use in kernel - std::unique_ptr sizes = - 
cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - input_col.size(), - cudf::mask_state::UNALLOCATED, - stream); - std::unique_ptr indices = - cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - input_col.size(), - cudf::mask_state::UNALLOCATED, - stream); - - auto sizes_view = sizes->mutable_view(); - auto indices_view = indices->mutable_view(); - - auto device_in = cudf::column_device_view::create(input_col, stream); - auto device_values_to_replace = cudf::column_device_view::create(values_to_replace, stream); - auto device_replacement = cudf::column_device_view::create(replacement_values, stream); - auto device_sizes = cudf::mutable_column_device_view::create(sizes_view, stream); - auto device_indices = cudf::mutable_column_device_view::create(indices_view, stream); - - rmm::device_buffer valid_bits = - cudf::detail::create_null_mask(input_col.size(), cudf::mask_state::UNINITIALIZED, stream, mr); - - // Call first pass kernel to get sizes in offsets - cudf::detail::grid_1d grid{input_col.size(), BLOCK_SIZE, 1}; - replace_first<<>>( - *device_in, - *device_values_to_replace, - *device_replacement, - *device_sizes, - *device_indices, - reinterpret_cast(valid_bits.data()), - valid_count); - - auto [offsets, bytes] = cudf::detail::make_offsets_child_column( - sizes_view.begin(), sizes_view.end(), stream, mr); - auto offsets_view = offsets->mutable_view(); - auto device_offsets = cudf::mutable_column_device_view::create(offsets_view, stream); - - // Allocate chars array and output null mask - cudf::size_type null_count = input_col.size() - valid_counter.value(stream); - rmm::device_uvector output_chars(bytes, stream, mr); - auto d_chars = output_chars.data(); - - replace_second<<>>( - *device_in, *device_replacement, *device_offsets, d_chars, *device_indices); - - return cudf::make_strings_column(input_col.size(), - std::move(offsets), - output_chars.release(), - null_count, - std::move(valid_bits)); + return 
cudf::strings::detail::find_and_replace_all( + input_col, values_to_replace, replacement_values, stream, mr); } template <> diff --git a/cpp/src/strings/replace/find_replace.cu b/cpp/src/strings/replace/find_replace.cu new file mode 100644 index 00000000000..818bfa58427 --- /dev/null +++ b/cpp/src/strings/replace/find_replace.cu @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { +struct find_replace_fn { + column_device_view d_input; + column_device_view d_values; + column_device_view d_replacements; + + __device__ string_index_pair get_replacement(size_type idx) + { + if (d_replacements.is_null(idx)) { return string_index_pair{nullptr, 0}; } + auto const d_str = d_replacements.element(idx); + return string_index_pair{d_str.data(), d_str.size_bytes()}; + } + + __device__ string_index_pair operator()(size_type idx) + { + if (d_input.is_null(idx)) { return string_index_pair{nullptr, 0}; } + auto const d_str = d_input.element(idx); + // find d_str in d_values + // if found return corresponding replacement + // if not found, return d_str + auto const begin = thrust::counting_iterator(0); + auto const end = thrust::counting_iterator(d_values.size()); + auto const itr = + thrust::find_if(thrust::seq, begin, end, [d_values = 
d_values, d_str](size_type i) -> bool { + return d_str == d_values.element(i); + }); + return itr == end ? string_index_pair{d_str.data(), d_str.size_bytes()} : get_replacement(*itr); + } +}; + +} // namespace + +std::unique_ptr find_and_replace_all( + cudf::strings_column_view const& input, + cudf::strings_column_view const& values_to_replace, + cudf::strings_column_view const& replacement_values, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto d_input = cudf::column_device_view::create(input.parent(), stream); + auto d_values_to_replace = cudf::column_device_view::create(values_to_replace.parent(), stream); + auto d_replacements = cudf::column_device_view::create(replacement_values.parent(), stream); + + auto indices = rmm::device_uvector(input.size(), stream); + + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + indices.begin(), + find_replace_fn{*d_input, *d_values_to_replace, *d_replacements}); + + return make_strings_column(indices.begin(), indices.end(), stream, mr); +} + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index 8685e7300ba..613034efc12 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -97,9 +97,7 @@ TEST_F(ReplaceStringsTest, Strings) ASSERT_NO_THROW(result = cudf::find_and_replace_all( input_wrapper, values_to_replace_wrapper, replacement_wrapper)); std::vector expected{"z", "b", "c", "d", "e", "f", "g", "h"}; - std::vector ex_valid{1, 1, 1, 1, 1, 1, 1, 1}; - cudf::test::strings_column_wrapper expected_wrapper{ - expected.begin(), expected.end(), ex_valid.begin()}; + cudf::test::strings_column_wrapper expected_wrapper{expected.begin(), expected.end()}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected_wrapper); } @@ -160,7 +158,6 @@ TEST_F(ReplaceStringsTest, StringsResultAllEmpty) 
std::vector replacement{"a", ""}; std::vector replacement_valid{1, 1}; std::vector expected{"", "", "", "", "", "", "", ""}; - std::vector ex_valid{1, 1, 1, 1, 1, 1, 1, 1}; cudf::test::strings_column_wrapper input_wrapper{input.begin(), input.end()}; cudf::test::strings_column_wrapper values_to_replace_wrapper{values_to_replace.begin(), values_to_replace.end()}; @@ -170,8 +167,7 @@ TEST_F(ReplaceStringsTest, StringsResultAllEmpty) std::unique_ptr result; ASSERT_NO_THROW(result = cudf::find_and_replace_all( input_wrapper, values_to_replace_wrapper, replacement_wrapper)); - cudf::test::strings_column_wrapper expected_wrapper{ - expected.begin(), expected.end(), ex_valid.begin()}; + cudf::test::strings_column_wrapper expected_wrapper{expected.begin(), expected.end()}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected_wrapper); } From dda3f316cecd2cc23f97cd4fa9e44ec93efe5395 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Sat, 23 Mar 2024 00:09:11 +0000 Subject: [PATCH 010/272] Fix arrow-based round trip of empty dataframes (#15373) When materializing range indices we were not previously creating the correct metadata. So do that. While here, tidy up a few corner cases around creating range indices when constructing empty data frames. 
- Closes #12243 - Closes #14159 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15373 --- python/cudf/cudf/_lib/utils.pyx | 40 +++++++------ python/cudf/cudf/core/dataframe.py | 43 ++++++++------ .../tests/dataframe/test_io_serialization.py | 59 ++++++++++++++++++- python/cudf/cudf/tests/test_parquet.py | 14 ++++- 4 files changed, 115 insertions(+), 41 deletions(-) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index b6637e9df08..0afecb215e4 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -59,7 +59,7 @@ cpdef generate_pandas_metadata(table, index): types = [] index_levels = [] index_descriptors = [] - + columns_to_convert = list(table._columns) # Columns for name, col in table._data.items(): if cudf.get_option("mode.pandas_compatible"): @@ -90,6 +90,7 @@ cpdef generate_pandas_metadata(table, index): types.append(np_to_pa_dtype(col.dtype)) # Indexes + materialize_index = False if index is not False: for level, name in enumerate(table._index.names): if isinstance(table._index, cudf.core.multiindex.MultiIndex): @@ -107,22 +108,26 @@ cpdef generate_pandas_metadata(table, index): "step": table.index.step, } else: + materialize_index = True # When `index=True`, RangeIndex needs to be materialized. 
materialized_idx = cudf.Index(idx._values, name=idx.name) - descr = \ - _index_level_name( - index_name=materialized_idx.name, - level=level, - column_names=col_names - ) - index_levels.append(materialized_idx) - else: - descr = \ - _index_level_name( - index_name=idx.name, + descr = _index_level_name( + index_name=materialized_idx.name, level=level, column_names=col_names ) + index_levels.append(materialized_idx) + columns_to_convert.append(materialized_idx._values) + col_names.append(descr) + types.append(np_to_pa_dtype(materialized_idx.dtype)) + else: + descr = _index_level_name( + index_name=idx.name, + level=level, + column_names=col_names + ) + columns_to_convert.append(idx._values) + col_names.append(descr) if isinstance(idx.dtype, cudf.CategoricalDtype): raise ValueError( "'category' column dtypes are currently not " @@ -141,17 +146,16 @@ cpdef generate_pandas_metadata(table, index): types.append(np_to_pa_dtype(idx.dtype)) index_levels.append(idx) - col_names.append(name) index_descriptors.append(descr) + df_meta = table.head(0) + if materialize_index: + df_meta.index = df_meta.index._as_int_index() metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=[ - col - for col in table._columns - ], + columns_to_convert=columns_to_convert, # It is OKAY to do `.head(0).to_pandas()` because # this method will extract `.columns` metadata only - df=table.head(0).to_pandas(), + df=df_meta.to_pandas(), column_names=col_names, index_levels=index_levels, index_descriptors=index_descriptors, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index da0a969b70c..2a4f93c1716 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5485,14 +5485,18 @@ def from_arrow(cls, table): return out @_cudf_nvtx_annotate - def to_arrow(self, preserve_index=True): + def to_arrow(self, preserve_index=None): """ Convert to a PyArrow Table. 
Parameters ---------- - preserve_index : bool, default True - whether index column and its meta data needs to be saved or not + preserve_index : bool, optional + whether index column and its meta data needs to be saved + or not. The default of None will store the index as a + column, except for a RangeIndex which is stored as + metadata only. Setting preserve_index to True will force + a RangeIndex to be materialized. Returns ------- @@ -5523,34 +5527,35 @@ def to_arrow(self, preserve_index=True): data = self.copy(deep=False) index_descr = [] - if preserve_index: - if isinstance(self.index, cudf.RangeIndex): + write_index = preserve_index is not False + keep_range_index = write_index and preserve_index is None + index = self.index + if write_index: + if isinstance(index, cudf.RangeIndex) and keep_range_index: descr = { "kind": "range", - "name": self.index.name, - "start": self.index._start, - "stop": self.index._stop, + "name": index.name, + "start": index._start, + "stop": index._stop, "step": 1, } else: - if isinstance(self.index, MultiIndex): + if isinstance(index, cudf.RangeIndex): + index = index._as_int_index() + index.name = "__index_level_0__" + if isinstance(index, MultiIndex): gen_names = tuple( - f"level_{i}" - for i, _ in enumerate(self.index._data.names) + f"level_{i}" for i, _ in enumerate(index._data.names) ) else: gen_names = ( - self.index.names - if self.index.name is not None - else ("index",) + index.names if index.name is not None else ("index",) ) - for gen_name, col_name in zip( - gen_names, self.index._data.names - ): + for gen_name, col_name in zip(gen_names, index._data.names): data._insert( data.shape[1], gen_name, - self.index._data[col_name], + index._data[col_name], ) descr = gen_names[0] index_descr.append(descr) @@ -5560,7 +5565,7 @@ def to_arrow(self, preserve_index=True): columns_to_convert=[self[col] for col in self._data.names], df=self, column_names=out.schema.names, - index_levels=[self.index], + index_levels=[index], 
index_descriptors=index_descr, preserve_index=preserve_index, types=out.schema.types, diff --git a/python/cudf/cudf/tests/dataframe/test_io_serialization.py b/python/cudf/cudf/tests/dataframe/test_io_serialization.py index 06777c8e6af..ad81609470c 100644 --- a/python/cudf/cudf/tests/dataframe/test_io_serialization.py +++ b/python/cudf/cudf/tests/dataframe/test_io_serialization.py @@ -1 +1,58 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +import contextlib +from io import BytesIO + +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +import pytest + +import cudf +from cudf.testing._utils import assert_eq + + +@pytest.mark.parametrize( + "index", + [range(1, 11), list(range(1, 11)), range(1, 11)[::2]], + ids=["RangeIndex", "IntIndex", "StridedRange"], +) +@pytest.mark.parametrize("write_index", [False, True, None]) +@pytest.mark.parametrize("empty", [False, True], ids=["nonempty", "empty"]) +def test_dataframe_parquet_roundtrip(index, write_index, empty): + if empty: + data = {} + else: + data = {"a": [i * 2 for i in index]} + df = cudf.DataFrame(data=data, index=index) + pf = pd.DataFrame(data=data, index=index) + gpu_buf = BytesIO() + cpu_buf = BytesIO() + + df.to_parquet(gpu_buf, index=write_index) + pf.to_parquet(cpu_buf, index=write_index) + gpu_table = pq.read_table(gpu_buf) + cpu_table = pq.read_table(cpu_buf) + metadata_equal = ( + gpu_table.schema.pandas_metadata == cpu_table.schema.pandas_metadata + ) + if empty and write_index is not False: + # https://github.com/rapidsai/cudf/issues/15372 + ctx = pytest.raises(AssertionError) + else: + ctx = contextlib.nullcontext() + with ctx: + assert metadata_equal + + gpu_read = cudf.read_parquet(gpu_buf) + cpu_read = cudf.read_parquet(cpu_buf) + with ctx: + assert_eq(gpu_read, cpu_read) + + +@pytest.mark.parametrize("preserve_index", [False, True, None]) +def test_dataframe_to_arrow_preserve_index(preserve_index): + df = cudf.DataFrame({"x": 
["cat", "dog"] * 5}) + pf = df.to_pandas() + expect = pa.Table.from_pandas(pf, preserve_index=preserve_index).schema + got = df.to_arrow(preserve_index=preserve_index).schema + assert expect == got diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 8b72fe84359..9ba71b28637 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2442,9 +2442,17 @@ def test_parquet_index(pdf, index): run_parquet_index(pdf, index) -@pytest.mark.parametrize("index", [None, True]) -@pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/12243", +@pytest.mark.parametrize( + "index", + [ + pytest.param( + None, + marks=pytest.mark.xfail( + reason="https://github.com/apache/arrow/issues/40743" + ), + ), + True, + ], ) def test_parquet_index_empty(index): pdf = pd.DataFrame(index=pd.RangeIndex(0, 10, 1)) From 933e32ab9ad8e5057282c48129ddbd745c538967 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 25 Mar 2024 11:47:51 -0500 Subject: [PATCH 011/272] Update udf_cpp to use rapids_cpm_cccl. (#15331) This PR updates the `udf_cpp` target to use `rapids_cpm_cccl`. The previous `rapids_cpm_libcudacxx` has been deprecated. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/15331 --- python/cudf/udf_cpp/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/udf_cpp/CMakeLists.txt b/python/cudf/udf_cpp/CMakeLists.txt index 57b52559f00..fe7f9d0b00d 100644 --- a/python/cudf/udf_cpp/CMakeLists.txt +++ b/python/cudf/udf_cpp/CMakeLists.txt @@ -26,8 +26,8 @@ rapids_find_package( INSTALL_EXPORT_SET udf-exports ) -include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) -rapids_cpm_libcudacxx(BUILD_EXPORT_SET udf-exports INSTALL_EXPORT_SET udf-exports) +include(${rapids-cmake-dir}/cpm/cccl.cmake) +rapids_cpm_cccl(BUILD_EXPORT_SET udf-exports INSTALL_EXPORT_SET udf-exports) add_library(cudf_strings_udf SHARED strings/src/strings/udf/udf_apis.cu) target_include_directories( From a7ceedecbbfb3159520fc0d5aeaea4db9d2e4327 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 26 Mar 2024 17:24:41 -0700 Subject: [PATCH 012/272] Use logical types in Parquet reader (#15365) Closes #15224. Now use logical type exclusively in the reader rather than the deprecated converted type. 
Authors: - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - MithunR (https://github.com/mythrocks) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/15365 --- cpp/src/io/parquet/decode_fixed.cu | 4 +- cpp/src/io/parquet/decode_preprocess.cu | 2 +- cpp/src/io/parquet/page_data.cu | 18 +- cpp/src/io/parquet/page_data.cuh | 3 +- cpp/src/io/parquet/page_decode.cuh | 58 ++--- cpp/src/io/parquet/page_hdr.cu | 4 +- cpp/src/io/parquet/page_string_decode.cu | 4 +- cpp/src/io/parquet/parquet_gpu.hpp | 41 ++-- cpp/src/io/parquet/reader_impl.cpp | 16 +- cpp/src/io/parquet/reader_impl_chunking.cu | 49 ++--- cpp/src/io/parquet/reader_impl_helpers.cpp | 210 ++++++++++--------- cpp/src/io/parquet/reader_impl_preprocess.cu | 4 +- 12 files changed, 220 insertions(+), 193 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 062363db503..945a7dcb4c6 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -165,7 +165,7 @@ __device__ inline void gpuDecodeValues( constexpr int max_batch_size = num_warps * cudf::detail::warp_size; PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - int const dtype = s->col.data_type & 7; + int const dtype = s->col.physical_type; // decode values int pos = start; @@ -187,7 +187,7 @@ __device__ inline void gpuDecodeValues( uint32_t dtype_len = s->dtype_len; void* dst = nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; - if (s->col.converted_type == DECIMAL) { + if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) { switch (dtype) { case INT32: gpuOutputFast(s, sb, src_pos, static_cast(dst)); break; case INT64: gpuOutputFast(s, sb, src_pos, static_cast(dst)); break; diff --git 
a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 8f772636c7e..e49801e6172 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -389,7 +389,7 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) // we only need to preprocess hierarchies with repetition in them (ie, hierarchies // containing lists anywhere within). compute_string_sizes = - compute_string_sizes && ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4); + compute_string_sizes && s->col.physical_type == BYTE_ARRAY && !s->col.is_strings_to_cat; // early out optimizations: diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 261e04e3f19..62ce5b9f9a5 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -77,7 +77,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) if (s->dict_base) { out_thread0 = (s->dict_bits > 0) ? 64 : 32; } else { - switch (s->col.data_type & 7) { + switch (s->col.physical_type) { case BOOLEAN: [[fallthrough]]; case BYTE_ARRAY: [[fallthrough]]; case FIXED_LEN_BYTE_ARRAY: out_thread0 = 64; break; @@ -123,16 +123,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) // be needed in the other DecodeXXX kernels. 
if (s->dict_base) { src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; - } else if ((s->col.data_type & 7) == BOOLEAN) { + } else if (s->col.physical_type == BOOLEAN) { src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f); - } else if ((s->col.data_type & 7) == BYTE_ARRAY or - (s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + } else if (s->col.physical_type == BYTE_ARRAY or + s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); } if (t == 32) { s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values - int const dtype = s->col.data_type & 7; + int const dtype = s->col.physical_type; src_pos += t - out_thread0; // the position in the output column/buffer @@ -166,10 +166,12 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) uint32_t dtype_len = s->dtype_len; void* dst = nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; + auto const is_decimal = + s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL; if (dtype == BYTE_ARRAY) { - if (s->col.converted_type == DECIMAL) { + if (is_decimal) { auto const [ptr, len] = gpuGetStringData(s, sb, val_src_pos); - auto const decimal_precision = s->col.decimal_precision; + auto const decimal_precision = s->col.logical_type->precision(); if (decimal_precision <= MAX_DECIMAL32_PRECISION) { gpuOutputByteArrayAsInt(ptr, len, static_cast(dst)); } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { @@ -182,7 +184,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) } } else if (dtype == BOOLEAN) { gpuOutputBoolean(sb, val_src_pos, static_cast(dst)); - } else if (s->col.converted_type == DECIMAL) { + } else if (is_decimal) { switch (dtype) { case INT32: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; case INT64: gpuOutputFast(s, sb, val_src_pos, static_cast(dst)); break; diff --git a/cpp/src/io/parquet/page_data.cuh 
b/cpp/src/io/parquet/page_data.cuh index f0fa7d814cf..df8d801d66c 100644 --- a/cpp/src/io/parquet/page_data.cuh +++ b/cpp/src/io/parquet/page_data.cuh @@ -34,8 +34,7 @@ template inline __device__ void gpuOutputString(page_state_s* s, state_buf* sb, int src_pos, void* dstv) { auto [ptr, len] = gpuGetStringData(s, sb, src_pos); - // make sure to only hash `BYTE_ARRAY` when specified with the output type size - if (s->dtype_len == 4 and (s->col.data_type & 7) == BYTE_ARRAY) { + if (s->col.is_strings_to_cat and s->col.physical_type == BYTE_ARRAY) { // Output hash. This hash value is used if the option to convert strings to // categoricals is enabled. The seed value is chosen arbitrarily. uint32_t constexpr hash_seed = 33; diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index a081ee4e03f..fa1de5f301d 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -441,7 +441,7 @@ gpuInitStringDescriptors(page_state_s* s, [[maybe_unused]] state_buf* sb, int ta while (pos < target_pos) { int len = 0; - if ((s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + if (s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { if (k < dict_size) { len = s->dtype_len_in; } } else { if (k + 4 <= dict_size) { @@ -1144,11 +1144,11 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, if (s->page.num_input_values > 0) { uint8_t* cur = s->page.page_data; uint8_t* end = cur + s->page.uncompressed_page_size; - - uint32_t dtype_len_out = s->col.data_type >> 3; - s->ts_scale = 0; + s->ts_scale = 0; // Validate data type - auto const data_type = s->col.data_type & 7; + auto const data_type = s->col.physical_type; + auto const is_decimal = + s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL; switch (data_type) { case BOOLEAN: s->dtype_len = 1; // Boolean are stored as 1 byte on the output @@ -1159,13 +1159,15 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, if 
(s->col.ts_clock_rate) { int32_t units = 0; // Duration types are not included because no scaling is done when reading - if (s->col.converted_type == TIMESTAMP_MILLIS) { - units = cudf::timestamp_ms::period::den; - } else if (s->col.converted_type == TIMESTAMP_MICROS) { - units = cudf::timestamp_us::period::den; - } else if (s->col.logical_type.has_value() and - s->col.logical_type->is_timestamp_nanos()) { - units = cudf::timestamp_ns::period::den; + if (s->col.logical_type.has_value()) { + auto const& lt = s->col.logical_type.value(); + if (lt.is_timestamp_millis()) { + units = cudf::timestamp_ms::period::den; + } else if (lt.is_timestamp_micros()) { + units = cudf::timestamp_us::period::den; + } else if (lt.is_timestamp_nanos()) { + units = cudf::timestamp_ns::period::den; + } } if (units and units != s->col.ts_clock_rate) { s->ts_scale = (s->col.ts_clock_rate < units) ? -(units / s->col.ts_clock_rate) @@ -1176,8 +1178,8 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, case DOUBLE: s->dtype_len = 8; break; case INT96: s->dtype_len = 12; break; case BYTE_ARRAY: - if (s->col.converted_type == DECIMAL) { - auto const decimal_precision = s->col.decimal_precision; + if (is_decimal) { + auto const decimal_precision = s->col.logical_type->precision(); s->dtype_len = [decimal_precision]() { if (decimal_precision <= MAX_DECIMAL32_PRECISION) { return sizeof(int32_t); @@ -1192,14 +1194,14 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } break; default: // FIXED_LEN_BYTE_ARRAY: - s->dtype_len = dtype_len_out; + s->dtype_len = s->col.type_length; if (s->dtype_len <= 0) { s->set_error_code(decode_error::INVALID_DATA_TYPE); } break; } // Special check for downconversions s->dtype_len_in = s->dtype_len; if (data_type == FIXED_LEN_BYTE_ARRAY) { - if (s->col.converted_type == DECIMAL) { + if (is_decimal) { s->dtype_len = [dtype_len = s->dtype_len]() { if (dtype_len <= sizeof(int32_t)) { return sizeof(int32_t); @@ -1213,17 +1215,17 @@ inline 
__device__ bool setupLocalPageInfo(page_state_s* const s, s->dtype_len = sizeof(string_index_pair); } } else if (data_type == INT32) { - if (dtype_len_out == 1) { - // INT8 output - s->dtype_len = 1; - } else if (dtype_len_out == 2) { - // INT16 output - s->dtype_len = 2; - } else if (s->col.converted_type == TIME_MILLIS) { - // INT64 output - s->dtype_len = 8; + // check for smaller bitwidths + if (s->col.logical_type.has_value()) { + auto const& lt = s->col.logical_type.value(); + if (lt.type == LogicalType::INTEGER) { + s->dtype_len = lt.bit_width() / 8; + } else if (lt.is_time_millis()) { + // cudf outputs as INT64 + s->dtype_len = 8; + } } - } else if (data_type == BYTE_ARRAY && dtype_len_out == 4) { + } else if (data_type == BYTE_ARRAY && s->col.is_strings_to_cat) { s->dtype_len = 4; // HASH32 output } else if (data_type == INT96) { s->dtype_len = 8; // Convert to 64-bit timestamp @@ -1298,7 +1300,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, case Encoding::PLAIN_DICTIONARY: case Encoding::RLE_DICTIONARY: // RLE-packed dictionary indices, first byte indicates index length in bits - if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) { + if (s->col.physical_type == BYTE_ARRAY && s->col.str_dict_index != nullptr) { // String dictionary: use index s->dict_base = reinterpret_cast(s->col.str_dict_index); s->dict_size = s->col.dict_page->num_input_values * sizeof(string_index_pair); @@ -1316,7 +1318,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, case Encoding::PLAIN: s->dict_size = static_cast(end - cur); s->dict_val = 0; - if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } + if (s->col.physical_type == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } break; case Encoding::RLE: { // first 4 bytes are length of RLE data diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 4a50c7445b3..07e03460ecb 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ 
b/cpp/src/io/parquet/page_hdr.cu @@ -147,12 +147,12 @@ __device__ inline bool is_nested(ColumnChunkDesc const& chunk) __device__ inline bool is_byte_array(ColumnChunkDesc const& chunk) { - return (chunk.data_type & 7) == BYTE_ARRAY; + return chunk.physical_type == BYTE_ARRAY; } __device__ inline bool is_boolean(ColumnChunkDesc const& chunk) { - return (chunk.data_type & 7) == BOOLEAN; + return chunk.physical_type == BOOLEAN; } /** diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d8b1c1cc046..6f96d4dd1cf 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -689,7 +689,7 @@ CUDF_KERNEL void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPage auto const start_value = pp->start_val; // if data size is known, can short circuit here - if ((chunks[pp->chunk_idx].data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + if (chunks[pp->chunk_idx].physical_type == FIXED_LEN_BYTE_ARRAY) { if (t == 0) { pp->str_bytes = pp->num_valids * s->dtype_len_in; @@ -881,7 +881,7 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputePageStringSi auto const& col = s->col; size_t str_bytes = 0; // short circuit for FIXED_LEN_BYTE_ARRAY - if ((col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + if (col.physical_type == FIXED_LEN_BYTE_ARRAY) { str_bytes = pp->num_valids * s->dtype_len_in; } else { // now process string info in the range [start_value, end_value) diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 82ccb2b314a..200a8ec9ddb 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -370,8 +370,8 @@ struct ColumnChunkDesc { explicit ColumnChunkDesc(size_t compressed_size_, uint8_t* compressed_data_, size_t num_values_, - uint16_t datatype_, - uint16_t datatype_length_, + Type datatype_, + int32_t datatype_length_, size_t start_row_, uint32_t num_rows_, int16_t max_definition_level_, @@ -379,15 
+379,14 @@ struct ColumnChunkDesc { int16_t max_nesting_depth_, uint8_t def_level_bits_, uint8_t rep_level_bits_, - int8_t codec_, - int8_t converted_type_, + Compression codec_, thrust::optional logical_type_, - int8_t decimal_precision_, int32_t ts_clock_rate_, int32_t src_col_index_, int32_t src_col_schema_, column_chunk_info const* chunk_info_, - float list_bytes_per_row_est_) + float list_bytes_per_row_est_, + bool strings_to_categorical_) : compressed_data(compressed_data_), compressed_size(compressed_size_), num_values(num_values_), @@ -395,7 +394,8 @@ struct ColumnChunkDesc { num_rows(num_rows_), max_level{max_definition_level_, max_repetition_level_}, max_nesting_depth{max_nesting_depth_}, - data_type(datatype_ | (datatype_length_ << 3)), + type_length(datatype_length_), + physical_type(datatype_), level_bits{def_level_bits_, rep_level_bits_}, num_data_pages(0), num_dict_pages(0), @@ -405,14 +405,13 @@ struct ColumnChunkDesc { column_data_base{nullptr}, column_string_base{nullptr}, codec(codec_), - converted_type(converted_type_), logical_type(logical_type_), - decimal_precision(decimal_precision_), ts_clock_rate(ts_clock_rate_), src_col_index(src_col_index_), src_col_schema(src_col_schema_), h_chunk_info(chunk_info_), - list_bytes_per_row_est(list_bytes_per_row_est_) + list_bytes_per_row_est(list_bytes_per_row_est_), + is_strings_to_cat(strings_to_categorical_) { } @@ -423,7 +422,8 @@ struct ColumnChunkDesc { uint32_t num_rows{}; // number of rows in this chunk int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level int16_t max_nesting_depth{}; // max nesting depth of the output - uint16_t data_type{}; // basic column data type, ((type_length << 3) | // parquet::Type) + int32_t type_length{}; // type length from schema (for FLBA only) + Type physical_type{}; // parquet physical data type uint8_t level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels int32_t num_data_pages{}; // number of 
data pages @@ -433,10 +433,8 @@ struct ColumnChunkDesc { bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column void** column_data_base{}; // base pointers of column data void** column_string_base{}; // base pointers of column string data - int8_t codec{}; // compressed codec enum - int8_t converted_type{}; // converted type enum + Compression codec{}; // compressed codec enum thrust::optional logical_type{}; // logical type - int8_t decimal_precision{}; // Decimal precision int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index @@ -446,6 +444,8 @@ struct ColumnChunkDesc { column_chunk_info const* h_chunk_info{}; float list_bytes_per_row_est{}; // for LIST columns, an estimate on number of bytes per row + + bool is_strings_to_cat{}; // convert strings to hashes }; /** @@ -615,11 +615,16 @@ struct EncPage { */ constexpr bool is_string_col(ColumnChunkDesc const& chunk) { - auto const not_converted_to_decimal = chunk.converted_type != DECIMAL; + // return true for non-hashed byte_array and fixed_len_byte_array that isn't representing + // a decimal. 
+ if (chunk.logical_type.has_value() and chunk.logical_type->type == LogicalType::DECIMAL) { + return false; + } + auto const non_hashed_byte_array = - (chunk.data_type & 7) == BYTE_ARRAY and (chunk.data_type >> 3) != 4; - auto const fixed_len_byte_array = (chunk.data_type & 7) == FIXED_LEN_BYTE_ARRAY; - return not_converted_to_decimal and (non_hashed_byte_array or fixed_len_byte_array); + chunk.physical_type == BYTE_ARRAY and not chunk.is_strings_to_cat; + auto const fixed_len_byte_array = chunk.physical_type == FIXED_LEN_BYTE_ARRAY; + return non_hashed_byte_array or fixed_len_byte_array; } /** diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 8112328d962..2356878f6ba 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -28,6 +28,19 @@ namespace cudf::io::parquet::detail { +namespace { +// Tests the passed in logical type for a FIXED_LENGTH_BYTE_ARRAY column to see if it should +// be treated as a string. Currently the only logical type that has special handling is DECIMAL. +// Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which +// for now would also be treated as a string). 
+inline bool is_treat_fixed_length_as_string(thrust::optional const& logical_type) +{ + if (!logical_type.has_value()) { return true; } + return logical_type->type != LogicalType::DECIMAL; +} + +} // namespace + void reader::impl::decode_page_data(bool uses_custom_row_bounds, size_t skip_rows, size_t num_rows) { auto& pass = *_pass_itm_data; @@ -66,7 +79,8 @@ void reader::impl::decode_page_data(bool uses_custom_row_bounds, size_t skip_row // TODO: we could probably dummy up size stats for FLBA data since we know the width auto const has_flba = std::any_of(pass.chunks.begin(), pass.chunks.end(), [](auto const& chunk) { - return (chunk.data_type & 7) == FIXED_LEN_BYTE_ARRAY && chunk.converted_type != DECIMAL; + return chunk.physical_type == FIXED_LEN_BYTE_ARRAY and + is_treat_fixed_length_as_string(chunk.logical_type); }); if (!_has_page_index || uses_custom_row_bounds || has_flba) { diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 5c387147e4b..912f53a8277 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -364,33 +364,28 @@ int64_t find_next_split(int64_t cur_pos, /** * @brief Converts cuDF units to Parquet units. * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + * @return A tuple of Parquet clock rate and Parquet decimal type. */ -[[nodiscard]] std::tuple conversion_info( +[[nodiscard]] std::tuple> conversion_info( type_id column_type_id, type_id timestamp_type_id, Type physical, - thrust::optional converted, - int32_t length) + thrust::optional logical_type) { - int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? 
length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); + int32_t const clock_rate = + is_chrono(data_type{column_type_id}) ? to_clockrate(timestamp_type_id) : 0; + + // TODO(ets): this is leftover from the original code, but will we ever output decimal as + // anything but fixed point? + if (logical_type.has_value() and logical_type->type == LogicalType::DECIMAL) { + // if decimal but not outputting as float or decimal, then convert to no logical type + if (column_type_id != type_id::FLOAT64 and + not cudf::is_fixed_point(data_type{column_type_id})) { + return std::make_tuple(clock_rate, thrust::nullopt); + } } - int8_t converted_type = converted.value_or(UNKNOWN); - if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); + return std::make_tuple(clock_rate, std::move(logical_type)); } /** @@ -1515,12 +1510,11 @@ void reader::impl::create_global_chunk_info() auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); auto& schema = _metadata->get_schema(col.schema_idx); - auto [type_width, clock_rate, converted_type] = + auto [clock_rate, logical_type] = conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), _timestamp_type.id(), schema.type, - schema.converted_type, - schema.type_length); + schema.logical_type); // for lists, estimate the number of bytes per row. 
this is used by the subpass reader to // determine where to split the decompression boundaries @@ -1538,7 +1532,7 @@ void reader::impl::create_global_chunk_info() nullptr, col_meta.num_values, schema.type, - type_width, + schema.type_length, row_group_start, row_group_rows, schema.max_definition_level, @@ -1547,14 +1541,13 @@ void reader::impl::create_global_chunk_info() required_bits(schema.max_definition_level), required_bits(schema.max_repetition_level), col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, + logical_type, clock_rate, i, col.schema_idx, chunk_info, - list_bytes_per_row_est)); + list_bytes_per_row_est, + schema.type == BYTE_ARRAY and _strings_to_categorical)); } remaining_rows -= row_group_rows; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 776caa99ac9..bfc69264ab2 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -16,6 +16,7 @@ #include "reader_impl_helpers.hpp" +#include "io/parquet/parquet.hpp" #include "io/utilities/row_selection.hpp" #include @@ -25,44 +26,35 @@ namespace cudf::io::parquet::detail { namespace { -ConvertedType logical_type_to_converted_type(thrust::optional const& logical) +thrust::optional converted_to_logical_type(SchemaElement const& schema) { - if (not logical.has_value()) { return UNKNOWN; } - switch (logical->type) { - case LogicalType::STRING: return UTF8; - case LogicalType::MAP: return MAP; - case LogicalType::LIST: return LIST; - case LogicalType::ENUM: return ENUM; - case LogicalType::DECIMAL: return DECIMAL; // TODO use decimal scale/precision - case LogicalType::DATE: return DATE; - case LogicalType::TIME: - if (logical->is_time_millis()) { - return TIME_MILLIS; - } else if (logical->is_time_micros()) { - return TIME_MICROS; - } - break; - case LogicalType::TIMESTAMP: - if (logical->is_timestamp_millis()) { - return TIMESTAMP_MILLIS; - } else if 
(logical->is_timestamp_micros()) { - return TIMESTAMP_MICROS; - } - break; - case LogicalType::INTEGER: - switch (logical->bit_width()) { - case 8: return logical->is_signed() ? INT_8 : UINT_8; - case 16: return logical->is_signed() ? INT_16 : UINT_16; - case 32: return logical->is_signed() ? INT_32 : UINT_32; - case 64: return logical->is_signed() ? INT_64 : UINT_64; - default: break; - } - case LogicalType::UNKNOWN: return NA; - case LogicalType::JSON: return JSON; - case LogicalType::BSON: return BSON; - default: break; + if (schema.converted_type.has_value()) { + switch (schema.converted_type.value()) { + case ENUM: // treat ENUM as UTF8 string + case UTF8: return LogicalType{LogicalType::STRING}; + case MAP: return LogicalType{LogicalType::MAP}; + case LIST: return LogicalType{LogicalType::LIST}; + case DECIMAL: return LogicalType{DecimalType{schema.decimal_scale, schema.decimal_precision}}; + case DATE: return LogicalType{LogicalType::DATE}; + case TIME_MILLIS: return LogicalType{TimeType{true, TimeUnit::MILLIS}}; + case TIME_MICROS: return LogicalType{TimeType{true, TimeUnit::MICROS}}; + case TIMESTAMP_MILLIS: return LogicalType{TimestampType{true, TimeUnit::MILLIS}}; + case TIMESTAMP_MICROS: return LogicalType{TimestampType{true, TimeUnit::MICROS}}; + case UINT_8: return LogicalType{IntType{8, false}}; + case UINT_16: return LogicalType{IntType{16, false}}; + case UINT_32: return LogicalType{IntType{32, false}}; + case UINT_64: return LogicalType{IntType{64, false}}; + case INT_8: return LogicalType{IntType{8, true}}; + case INT_16: return LogicalType{IntType{16, true}}; + case INT_32: return LogicalType{IntType{32, true}}; + case INT_64: return LogicalType{IntType{64, true}}; + case JSON: return LogicalType{LogicalType::JSON}; + case BSON: return LogicalType{LogicalType::BSON}; + case INTERVAL: // there is no logical type for INTERVAL yet + default: return LogicalType{LogicalType::UNDEFINED}; + } } - return UNKNOWN; + return thrust::nullopt; } } // 
namespace @@ -74,76 +66,90 @@ type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id) { - auto const physical = schema.type; - auto const logical_type = schema.logical_type; - auto converted_type = schema.converted_type; - int32_t decimal_precision = schema.decimal_precision; - - // FIXME(ets): this should just use logical type to deduce the type_id. then fall back to - // converted_type if logical_type isn't set - // Logical type used for actual data interpretation; the legacy converted type - // is superseded by 'logical' type whenever available. - auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; } - if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type->precision(); } - - switch (converted_type.value_or(UNKNOWN)) { - case UINT_8: return type_id::UINT8; - case INT_8: return type_id::INT8; - case UINT_16: return type_id::UINT16; - case INT_16: return type_id::INT16; - case UINT_32: return type_id::UINT32; - case UINT_64: return type_id::UINT64; - case DATE: return type_id::TIMESTAMP_DAYS; - case TIME_MILLIS: return type_id::DURATION_MILLISECONDS; - case TIME_MICROS: return type_id::DURATION_MICROSECONDS; - case TIMESTAMP_MILLIS: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_MILLISECONDS; - case TIMESTAMP_MICROS: - return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id - : type_id::TIMESTAMP_MICROSECONDS; - case DECIMAL: - if (physical == INT32) { return type_id::DECIMAL32; } - if (physical == INT64) { return type_id::DECIMAL64; } - if (physical == FIXED_LEN_BYTE_ARRAY) { - if (schema.type_length <= static_cast(sizeof(int32_t))) { - return type_id::DECIMAL32; + auto const physical = schema.type; + auto logical_type = schema.logical_type; + + // sanity check, but not worth failing over + if (schema.converted_type.has_value() and not logical_type.has_value()) { + CUDF_LOG_WARN("ConvertedType is specified but not LogicalType"); + logical_type = converted_to_logical_type(schema); + } + + if (logical_type.has_value()) { + switch (logical_type->type) { + case LogicalType::INTEGER: { + auto const is_signed = logical_type->is_signed(); + switch (logical_type->bit_width()) { + case 8: return is_signed ? type_id::INT8 : type_id::UINT8; + case 16: return is_signed ? type_id::INT16 : type_id::UINT16; + case 32: return is_signed ? type_id::INT32 : type_id::UINT32; + case 64: return is_signed ? 
type_id::INT64 : type_id::UINT64; + default: CUDF_FAIL("Invalid integer bitwidth"); } - if (schema.type_length <= static_cast(sizeof(int64_t))) { - return type_id::DECIMAL64; + } break; + + case LogicalType::DATE: return type_id::TIMESTAMP_DAYS; + + case LogicalType::TIME: + if (logical_type->is_time_millis()) { + return type_id::DURATION_MILLISECONDS; + } else if (logical_type->is_time_micros()) { + return type_id::DURATION_MICROSECONDS; + } else if (logical_type->is_time_nanos()) { + return type_id::DURATION_NANOSECONDS; } - if (schema.type_length <= static_cast(sizeof(__int128_t))) { - return type_id::DECIMAL128; + break; + + case LogicalType::TIMESTAMP: + if (timestamp_type_id != type_id::EMPTY) { + return timestamp_type_id; + } else if (logical_type->is_timestamp_millis()) { + return type_id::TIMESTAMP_MILLISECONDS; + } else if (logical_type->is_timestamp_micros()) { + return type_id::TIMESTAMP_MICROSECONDS; + } else if (logical_type->is_timestamp_nanos()) { + return type_id::TIMESTAMP_NANOSECONDS; } - } - if (physical == BYTE_ARRAY) { - CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); - if (decimal_precision <= MAX_DECIMAL32_PRECISION) { + + case LogicalType::DECIMAL: { + int32_t const decimal_precision = logical_type->precision(); + if (physical == INT32) { return type_id::DECIMAL32; - } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { + } else if (physical == INT64) { return type_id::DECIMAL64; + } else if (physical == FIXED_LEN_BYTE_ARRAY) { + if (schema.type_length <= static_cast(sizeof(int32_t))) { + return type_id::DECIMAL32; + } else if (schema.type_length <= static_cast(sizeof(int64_t))) { + return type_id::DECIMAL64; + } else if (schema.type_length <= static_cast(sizeof(__int128_t))) { + return type_id::DECIMAL128; + } + } else if (physical == BYTE_ARRAY) { + CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); + if (decimal_precision <= MAX_DECIMAL32_PRECISION) { + 
return type_id::DECIMAL32; + } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { + return type_id::DECIMAL64; + } else { + return type_id::DECIMAL128; + } } else { - return type_id::DECIMAL128; + CUDF_FAIL("Invalid representation of decimal type"); } - } - CUDF_FAIL("Invalid representation of decimal type"); - break; - - // maps are just List>. - case MAP: - case LIST: return type_id::LIST; - case NA: return type_id::STRING; - // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support - default: break; - } + } break; - if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.has_value()) { - if (logical_type->is_timestamp_nanos()) { - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_NANOSECONDS; - } else if (logical_type->is_time_nanos()) { - return type_id::DURATION_NANOSECONDS; + // maps are just List>. + case LogicalType::MAP: + case LogicalType::LIST: return type_id::LIST; + + // All null column that can't have its type deduced. 
+ // Note: originally LogicalType::UNKNOWN was converted to ConvertedType::NA, and + // NA then became type_id::STRING, but with the following TODO: + // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support + case LogicalType::UNKNOWN: return type_id::STRING; + + default: break; } } @@ -208,6 +214,7 @@ void metadata::sanitize_schema() // This is a list of structs, so we need to mark this as a list, but also // add a struct child and move this element's children to the struct schema_elem.converted_type = LIST; + schema_elem.logical_type = LogicalType::LIST; schema_elem.repetition_type = OPTIONAL; auto const struct_node_idx = static_cast(schema.size()); @@ -216,7 +223,7 @@ void metadata::sanitize_schema() struct_elem.repetition_type = REQUIRED; struct_elem.num_children = schema_elem.num_children; struct_elem.type = UNDEFINED_TYPE; - struct_elem.converted_type = UNKNOWN; + struct_elem.converted_type = thrust::nullopt; // swap children struct_elem.children_idx = std::move(schema_elem.children_idx); @@ -238,6 +245,11 @@ void metadata::sanitize_schema() } } + // convert ConvertedType to LogicalType for older files + if (schema_elem.converted_type.has_value() and not schema_elem.logical_type.has_value()) { + schema_elem.logical_type = converted_to_logical_type(schema_elem); + } + for (auto& child_idx : schema_elem.children_idx) { process(child_idx); } diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index e39445108a6..4b7a64ac6ab 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -643,7 +643,7 @@ struct set_str_dict_index_count { __device__ void operator()(PageInfo const& page) { auto const& chunk = chunks[page.chunk_idx]; - if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) && (chunk.data_type & 0x7) == BYTE_ARRAY && + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) && chunk.physical_type == BYTE_ARRAY && (chunk.num_dict_pages > 0)) { // there 
is only ever one dictionary page per chunk, so this is safe to do in parallel. str_dict_index_count[page.chunk_idx] = page.num_input_values; @@ -659,7 +659,7 @@ struct set_str_dict_index_ptr { __device__ void operator()(size_t i) { auto& chunk = chunks[i]; - if ((chunk.data_type & 0x7) == BYTE_ARRAY && (chunk.num_dict_pages > 0)) { + if (chunk.physical_type == BYTE_ARRAY && (chunk.num_dict_pages > 0)) { chunk.str_dict_index = base + str_dict_index_offsets[i]; } } From aab6137c80c50eccc5007120f7140cfe6646b5e0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 1 Apr 2024 04:01:36 -0700 Subject: [PATCH 013/272] First pass at adding testing for pylibcudf (#15300) This PR adds tests of the `pylibcudf.copying` module along with establishing the infrastructure and best practices for writing pylibcudf tests going forward (and adding associated documentation). Resolves #15133 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Ashwin Srinath (https://github.com/shwina) - Jake Awe (https://github.com/AyodeAwe) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/15300 --- ci/test_python_cudf.sh | 8 + ci/test_wheel_cudf.sh | 8 + cpp/include/cudf/copying.hpp | 3 + cpp/src/copying/copy.cpp | 5 +- cpp/src/copying/copy_range.cu | 2 +- cpp/src/copying/scatter.cu | 11 +- docs/cudf/source/developer_guide/pylibcudf.md | 66 ++ docs/cudf/source/developer_guide/testing.md | 6 + python/cudf/cudf/_lib/cpp/copying.pxd | 42 +- python/cudf/cudf/_lib/pylibcudf/column.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/column.pyx | 9 +- python/cudf/cudf/_lib/pylibcudf/copying.pxd | 6 +- python/cudf/cudf/_lib/pylibcudf/copying.pyx | 126 ++- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 1 + python/cudf/cudf/_lib/pylibcudf/table.pxd | 3 + python/cudf/cudf/_lib/pylibcudf/table.pyx | 8 + python/cudf/cudf/_lib/pylibcudf/types.pyx | 5 + 
.../cudf/cudf/pylibcudf_tests/common/utils.py | 111 +++ python/cudf/cudf/pylibcudf_tests/conftest.py | 31 + python/cudf/cudf/pylibcudf_tests/pytest.ini | 8 + .../cudf/cudf/pylibcudf_tests/test_copying.py | 848 ++++++++++++++++++ 21 files changed, 1254 insertions(+), 54 deletions(-) create mode 100644 python/cudf/cudf/pylibcudf_tests/common/utils.py create mode 100644 python/cudf/cudf/pylibcudf_tests/conftest.py create mode 100644 python/cudf/cudf/pylibcudf_tests/pytest.ini create mode 100644 python/cudf/cudf/pylibcudf_tests/test_copying.py diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index bacb54b3896..217dd2fd9a8 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -14,6 +14,14 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e +rapids-logger "pytest pylibcudf" +pushd python/cudf/cudf/pylibcudf_tests +python -m pytest \ + --cache-clear \ + --dist=worksteal \ + . +popd + rapids-logger "pytest cudf" ./ci/run_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \ diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 83f0b976128..a6f122491b0 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -18,6 +18,14 @@ if [[ "$(arch)" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then rapids-logger "Run smoke tests for cudf" python ./ci/wheel_smoke_test_cudf.py else + rapids-logger "pytest pylibcudf" + pushd python/cudf/cudf/pylibcudf_tests + python -m pytest \ + --cache-clear \ + --dist=worksteal \ + . + popd + rapids-logger "pytest cudf" pushd python/cudf/cudf/tests python -m pytest \ diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index b2cde82fada..df96efdaffc 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -253,6 +253,8 @@ std::unique_ptr empty_like(scalar const& input); * If the `mask_alloc` allocates a validity mask that mask is also uninitialized * and the validity bits and the null count should be set by the caller. 
* + * @throws cudf::data_type_error if input type is not of fixed width. + * * @param input Immutable view of input column to emulate * @param mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN * @param mr Device memory resource used to allocate the returned column's device memory @@ -360,6 +362,7 @@ void copy_range_in_place(column_view const& source, * * @throws std::out_of_range for any invalid range. * @throws cudf::data_type_error if @p target and @p source have different types. + * @throws cudf::data_type_error if the data type is not fixed width, string, or dictionary * * @param source The column to copy from inside the range * @param target The column to copy from outside the range diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index 490a1ccb254..cb7d507de81 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -122,7 +122,8 @@ std::unique_ptr allocate_like(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_fixed_width(input.type()), "Expects only fixed-width type column"); + CUDF_EXPECTS( + is_fixed_width(input.type()), "Expects only fixed-width type column", cudf::data_type_error); mask_state allocate_mask = should_allocate_mask(mask_alloc, input.nullable()); return std::make_unique(input.type(), diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index 038646d8cf4..e10d7081a55 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -119,7 +119,7 @@ struct out_of_place_copy_range_dispatch { std::enable_if_t(), std::unique_ptr> operator()(Args...) 
{ - CUDF_FAIL("Unsupported type for out of place copy."); + CUDF_FAIL("Unsupported type for out of place copy.", cudf::data_type_error); } }; diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 7931df4c9f0..3bc3979ec1b 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -144,7 +144,9 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - CUDF_EXPECTS(source.get().type() == target.type(), "scalar and column types must match"); + CUDF_EXPECTS(source.get().type() == target.type(), + "scalar and column types must match", + cudf::data_type_error); auto const scalar_impl = static_cast(&source.get()); auto const source_view = string_view(scalar_impl->data(), scalar_impl->size()); @@ -166,6 +168,9 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { + CUDF_EXPECTS(source.get().type() == target.type(), + "scalar and column types must match", + cudf::data_type_error); auto result = lists::detail::scatter(source, scatter_iter, scatter_iter + scatter_rows, target, stream, mr); @@ -249,6 +254,10 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { + CUDF_EXPECTS(source.get().type() == target.type(), + "scalar and column types must match", + cudf::data_type_error); + // For each field of `source`, copy construct a scalar from the field // and dispatch to the corresponding scalar scatterer diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md index 0120cbb286e..0b881b2b057 100644 --- a/docs/cudf/source/developer_guide/pylibcudf.md +++ b/docs/cudf/source/developer_guide/pylibcudf.md @@ -96,6 +96,72 @@ There are a couple of notable points from the snippet above: - The object returned from libcudf is immediately converted to a pylibcudf type. 
- `cudf::gather` accepts a `cudf::out_of_bounds_policy` enum parameter. `OutOfBoundsPolicy` is an alias for this type in pylibcudf that matches our Python naming conventions (CapsCase instead of snake\_case). +## Testing + +When writing pylibcudf tests, it is important to remember that all the APIs should be tested in the C++ layer in libcudf already. +The primary purpose of pylibcudf tests is to ensure the correctness of the _bindings_; the correctness of the underlying implementation should generally be validated in libcudf. +If pylibcudf tests uncover a libcudf bug, a suitable libcudf test should be added to cover this case rather than relying solely on pylibcudf testing. + +pylibcudf's ``conftest.py`` contains some standard parametrized dtype fixture lists that may in turn be used to parametrize other fixtures. +Fixtures allocating data should leverage these dtype lists wherever possible to simplify testing across the matrix of important types. +Where appropriate, new fixture lists may be added. + +To run tests as efficiently as possible, the test suite should make generous use of fixtures. +The simplest general structure to follow is for pyarrow array/table/scalar fixtures to be parametrized by one of the dtype list. +Then, a corresponding pylibcudf fixture may be created using a simple `from_arrow` call. +This approach ensures consistent global coverage across types for various tests. + +In general, pylibcudf tests should prefer validating against a corresponding pyarrow implementation rather than hardcoding data. +This approach is more resilient to changes to input data, particularly given the fixture strategy outlined above. +Standard tools for comparing between pylibcudf and pyarrow types are provided in the utils module. 
+ +Here is an example demonstrating the above points: + +```python +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from cudf._lib import pylibcudf as plc +from utils import assert_column_eq + +# The pa_dtype fixture is defined in conftest.py. +@pytest.fixture(scope="module") +def pa_column(pa_dtype): + pa.array([1, 2, 3]) + + +@pytest.fixture(scope="module") +def column(pa_column): + return plc.interop.from_arrow(pa_column) + + +def test_foo(pa_column, column): + index = 1 + result = plc.foo(column) + expected = pa.foo(pa_column) + + assert_column_eq(result, expected) +``` + +Some guidelines on what should be tested: +- Tests SHOULD comprehensively cover the API, including all possible combinations of arguments required to ensure good test coverage. +- pylibcudf SHOULD NOT attempt to stress test large data sizes, and SHOULD instead defer to libcudf tests. + - Exception: In special cases where constructing suitable large tests is difficult in C++ (such as creating suitable input data for I/O testing), tests may be added to pylibcudf instead. +- Nullable data should always be tested. +- Expected exceptions should be tested. Tests should be written from the user's perspective in mind, and if the API is not currently throwing the appropriate exception it should be updated. + - Important note: If the exception should be produced by libcudf, the underlying libcudf API should be updated to throw the desired exception in C++. Such changes may require consultation with libcudf devs in nontrivial cases. [This issue](https://github.com/rapidsai/cudf/issues/12885) provides an overview and an indication of acceptable exception types that should cover most use cases. In rare cases a new C++ exception may need to be introduced in [`error.hpp`](https://github.com/rapidsai/cudf/blob/branch-24.04/cpp/include/cudf/utilities/error.hpp). 
If so, this exception will also need to be mapped to a suitable Python exception in [`exception_handler.pxd`](https://github.com/rapidsai/cudf/blob/branch-24.04/python/cudf/cudf/_lib/exception_handler.pxd). + +Some guidelines on how best to use pytests. +- By default, fixtures producing device data containers should be of module scope and treated as immutable by tests. Allocating data on the GPU is expensive and slows tests. Almost all pylibcudf operations are out of place operations, so module-scoped fixtures should not typically be problematic to work with. Session-scoped fixtures would also work, but they are harder to reason about since they live in a different module, and if they need to change for any reason they could affect an arbitrarily large number of tests. Module scope is a good balance. +- Where necessary, mutable fixtures should be named as such (e.g. `mutable_col`) and be of function scope. If possible, they can be implemented as simply making a copy of a corresponding module-scope immutable fixture to avoid duplicating the generation logic. + +Tests should be organized corresponding to pylibcudf modules, i.e. one test module for each pylibcudf module. + +The following sections of the cuDF Python testing guide also generally apply to pylibcudf unless superseded by any statements above: +- [](#test_parametrization) +- [](#xfailing_tests) +- [](#testing_warnings) + ## Miscellaneous Notes ### Cython Scoped Enums diff --git a/docs/cudf/source/developer_guide/testing.md b/docs/cudf/source/developer_guide/testing.md index a28a6b9192d..f12f809d5db 100644 --- a/docs/cudf/source/developer_guide/testing.md +++ b/docs/cudf/source/developer_guide/testing.md @@ -55,6 +55,8 @@ Typically, exception cases require specific assertions or other special logic, s The main exception to this rule is tests based on comparison to pandas. Such tests may test exceptional cases alongside more typical cases since the logic is generally identical. 
+(test_parametrization)= + ### Parametrization: custom fixtures and `pytest.mark.parametrize` When it comes to parametrizing tests written with `pytest`, @@ -140,6 +142,8 @@ def test_odds(): Other approaches are also possible, and the best solution should be discussed on a case-by-case basis during PR review. +(xfailing_tests)= + ### Tests with expected failures (`xfail`s) In some circumstances it makes sense to mark a test as _expected_ to @@ -218,6 +222,8 @@ This way, when the bug is fixed, the test suite will fail at this point (and we will remember to update the test). +(testing_warnings)= + ### Testing code that throws warnings Some code may be expected to throw warnings. diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index f3e5c0aec72..053e2299f22 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t, int64_t, uint8_t from libcpp cimport bool @@ -33,19 +33,19 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const column_view& input, size_type offset, const scalar& fill_values - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] scatter ( const table_view& source_table, const column_view& scatter_map, const table_view& target_table, - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] scatter ( const vector[reference_wrapper[constscalar]]& source_scalars, const column_view& indices, const table_view& target, - ) except + + ) except +cudf_exception_handler cpdef enum class mask_allocation_policy(int32_t): NEVER @@ -54,22 +54,22 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: cdef unique_ptr[column] empty_like ( const column_view& input_column - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] allocate_like ( const column_view& input_column, mask_allocation_policy policy - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] allocate_like ( const column_view& input_column, size_type size, mask_allocation_policy policy - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] empty_like ( const table_view& input_table - ) except + + ) except +cudf_exception_handler cdef void copy_range_in_place ( const column_view& input_column, @@ -77,7 +77,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: size_type input_begin, size_type input_end, size_type target_begin - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_range ( const column_view& input_column, @@ -85,68 +85,68 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: size_type input_begin, size_type input_end, size_type target_begin - ) except + + ) except +cudf_exception_handler cdef vector[column_view] slice ( const column_view& input_column, vector[size_type] indices - ) except + + ) except +cudf_exception_handler 
cdef vector[table_view] slice ( const table_view& input_table, vector[size_type] indices - ) except + + ) except +cudf_exception_handler cdef vector[column_view] split ( const column_view& input_column, vector[size_type] splits - ) except + + ) except +cudf_exception_handler cdef vector[table_view] split ( const table_view& input_table, vector[size_type] splits - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_if_else ( const column_view& lhs, const column_view& rhs, const column_view& boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_if_else ( const scalar& lhs, const column_view& rhs, const column_view& boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_if_else ( const column_view& lhs, const scalar& rhs, const column_view boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] copy_if_else ( const scalar& lhs, const scalar& rhs, const column_view boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] boolean_mask_scatter ( const table_view& input, const table_view& target, const column_view& boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[table] boolean_mask_scatter ( const vector[reference_wrapper[constscalar]]& input, const table_view& target, const column_view& boolean_mask - ) except + + ) except +cudf_exception_handler cdef unique_ptr[scalar] get_element ( const column_view& input, size_type index - ) except + + ) except +cudf_exception_handler cpdef enum class sample_with_replacement(bool): FALSE diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index fc5cc77c9e7..66ccdb53d1a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -43,6 +43,7 @@ cdef class Column: cpdef gpumemoryview data(self) cpdef gpumemoryview null_mask(self) cpdef list children(self) + cpdef 
Column copy(self) cpdef ListColumnView list_view(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index 3c5c53f99cf..2565e92d5c9 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -1,7 +1,7 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from rmm._lib.device_buffer cimport DeviceBuffer @@ -274,6 +274,13 @@ cdef class Column: """The children of the column.""" return self._children + cpdef Column copy(self): + """Create a copy of the column.""" + cdef unique_ptr[column] c_result + with nogil: + c_result = move(make_unique[column](self.view())) + return Column.from_libcudf(move(c_result)) + cdef class ListColumnView: """Accessor for methods of a Column that are specific to lists.""" diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/copying.pxd index 7b5f1e70ea3..0211d122c8e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pxd @@ -58,12 +58,12 @@ cpdef Column copy_range( size_type target_begin, ) -cpdef Column shift(Column input, size_type offset, Scalar fill_values) - -cpdef list split(ColumnOrTable input, list splits) +cpdef Column shift(Column input, size_type offset, Scalar fill_value) cpdef list slice(ColumnOrTable input, list indices) +cpdef list split(ColumnOrTable input, list splits) + cpdef Column copy_if_else( LeftCopyIfElseOperand lhs, RightCopyIfElseOperand rhs, diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/cudf/cudf/_lib/pylibcudf/copying.pyx index d78955dc325..125a4ffe65f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pyx @@ -54,6 +54,11 @@ cpdef Table gather( ------- pylibcudf.Table The result of the gather + 
+ Raises + ------ + ValueError + If the gather_map contains nulls. """ cdef unique_ptr[table] c_result with nogil: @@ -92,6 +97,20 @@ cpdef Table scatter( ------- Table The result of the scatter + + Raises + ------ + ValueError + If any of the following occur: + - scatter_map contains null values. + - source is a Table and the number of columns in source does not match the + number of columns in target. + - source is a Table and the number of rows in source does not match the + number of elements in scatter_map. + - source is a List[Scalar] and the number of scalars does not match the + number of columns in target. + TypeError + If data types of the source and target columns do not match. """ cdef unique_ptr[table] c_result cdef vector[reference_wrapper[const scalar]] source_scalars @@ -207,6 +226,17 @@ cpdef Column copy_range_in_place( The index of the last element in input_column to copy. target_begin : int The index of the first element in target_column to overwrite. + + Raises + ------ + TypeError + If the operation is attempted on non-fixed width types since those would require + memory reallocations, or if the input and target columns have different types. + IndexError + If the indices accessed by the ranges implied by input_begin, input_end, and + target_begin are out of bounds. + ValueError + If source has null values and target is not nullable. """ # Need to initialize this outside the function call so that Cython doesn't @@ -251,6 +281,14 @@ cpdef Column copy_range( ------- pylibcudf.Column A copy of target_column with the specified range overwritten. + + Raises + ------ + IndexError + If the indices accessed by the ranges implied by input_begin, input_end, and + target_begin are out of bounds. + TypeError + If target and source have different types. 
""" cdef unique_ptr[column] c_result @@ -266,7 +304,7 @@ cpdef Column copy_range( return Column.from_libcudf(move(c_result)) -cpdef Column shift(Column input, size_type offset, Scalar fill_values): +cpdef Column shift(Column input, size_type offset, Scalar fill_value): """Shift the elements of input by offset. For details on the implementation, see :cpp:func:`shift`. @@ -285,6 +323,12 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_values): ------- pylibcudf.Column A copy of input shifted by offset. + + Raises + ------ + TypeError + If the fill_value is not of the same type as input, or if the input type is not + of fixed width or string type. """ cdef unique_ptr[column] c_result with nogil: @@ -292,37 +336,44 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_values): cpp_copying.shift( input.view(), offset, - dereference(fill_values.c_obj) + dereference(fill_value.c_obj) ) ) return Column.from_libcudf(move(c_result)) -cpdef list split(ColumnOrTable input, list splits): - """Split input into multiple. +cpdef list slice(ColumnOrTable input, list indices): + """Slice input according to indices. - For details on the implementation, see :cpp:func:`split`. + For details on the implementation, see :cpp:func:`slice`. Parameters ---------- - input : Union[Column, Table] - The column to split. - splits : List[int] - The indices at which to split the column. + input_column : Union[Column, Table] + The column or table to slice. + indices : List[int] + The indices to select from input. Returns ------- List[Union[Column, Table]] - The result of splitting input. + The result of slicing ``input``. + + Raises + ------ + ValueError + If indices size is not even or the values in any pair of lower/upper bounds are + strictly decreasing. + IndexError + When any of the indices don't belong to the range ``[0, input_column.size())``. 
""" - cdef vector[size_type] c_splits = splits + cdef vector[size_type] c_indices = indices cdef vector[column_view] c_col_result cdef vector[table_view] c_tbl_result cdef int i - if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.split(input.view(), c_splits)) + c_col_result = move(cpp_copying.slice(input.view(), c_indices)) return [ Column.from_column_view(c_col_result[i], input) @@ -330,7 +381,7 @@ cpdef list split(ColumnOrTable input, list splits): ] else: with nogil: - c_tbl_result = move(cpp_copying.split(input.view(), c_splits)) + c_tbl_result = move(cpp_copying.slice(input.view(), c_indices)) return [ Table.from_table_view(c_tbl_result[i], input) @@ -338,30 +389,31 @@ cpdef list split(ColumnOrTable input, list splits): ] -cpdef list slice(ColumnOrTable input, list indices): - """Slice input according to indices. +cpdef list split(ColumnOrTable input, list splits): + """Split input into multiple. - For details on the implementation, see :cpp:func:`slice`. + For details on the implementation, see :cpp:func:`split`. Parameters ---------- - input_column : Union[Column, Table] - The column or table to slice. - indices : List[int] - The indices to select from input. + input : Union[Column, Table] + The column to split. + splits : List[int] + The indices at which to split the column. Returns ------- List[Union[Column, Table]] - The result of slicing ``input``. + The result of splitting input. 
""" - cdef vector[size_type] c_indices = indices + cdef vector[size_type] c_splits = splits cdef vector[column_view] c_col_result cdef vector[table_view] c_tbl_result cdef int i + if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.slice(input.view(), c_indices)) + c_col_result = move(cpp_copying.split(input.view(), c_splits)) return [ Column.from_column_view(c_col_result[i], input) @@ -369,7 +421,7 @@ cpdef list slice(ColumnOrTable input, list indices): ] else: with nogil: - c_tbl_result = move(cpp_copying.slice(input.view(), c_indices)) + c_tbl_result = move(cpp_copying.split(input.view(), c_splits)) return [ Table.from_table_view(c_tbl_result[i], input) @@ -401,6 +453,15 @@ cpdef Column copy_if_else( ------- pylibcudf.Column The result of copying elements from lhs and rhs according to boolean_mask. + + Raises + ------ + TypeError + If lhs and rhs are not of the same type or if the boolean mask is not of type + bool. + ValueError + If boolean mask is not of the same length as lhs and rhs (whichever are + columns), or if lhs and rhs are not of the same length (if both are columns). """ cdef unique_ptr[column] result @@ -459,6 +520,16 @@ cpdef Table boolean_mask_scatter( ------- Table The result of the scatter + + Raises + ------ + ValueError + If input.num_columns() != target.num_columns(), boolean_mask.size() != + target.num_rows(), or if input is a Table and the number of `true` in + `boolean_mask` > input.num_rows(). + TypeError + If any input type does not match the corresponding target column's type, or + if boolean_mask.type() is not bool. """ cdef unique_ptr[table] result cdef vector[reference_wrapper[const scalar]] source_scalars @@ -502,6 +573,11 @@ cpdef Scalar get_element(Column input_column, size_type index): ------- pylibcudf.Scalar The element at index from input_column. + + Raises + ------ + IndexError + If index is out of bounds. 
""" cdef unique_ptr[scalar] c_output with nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index e7471033fc8..8dc41fccc0c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -140,6 +140,7 @@ def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None): @from_arrow.register(pa.Array) +@from_arrow.register(pa.ChunkedArray) def _from_arrow_column(pyarrow_object, *, DataType data_type=None): if data_type is not None: raise ValueError("data_type may not be passed for arrays") diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd index 327f3911489..7467bfccaa8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd @@ -12,6 +12,9 @@ cdef class Table: cdef table_view view(self) nogil + cpdef int num_columns(self) + cpdef int num_rows(self) + @staticmethod cdef Table from_libcudf(unique_ptr[table] libcudf_tbl) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx index 793e6330244..1fa60ec2b6c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx @@ -77,6 +77,14 @@ cdef class Table: for i in range(tv.num_columns()) ]) + cpdef int num_columns(self): + """The number of columns in this table.""" + return len(self._columns) + + cpdef int num_rows(self): + """The number of rows in this table.""" + return self._columns[0].size() + cpdef list columns(self): """The columns in this table.""" return self._columns diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index f6ff6e5a2fc..d8b92283412 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -39,6 +39,11 @@ cdef class DataType: """Get the scale associated with this data type.""" return self.c_obj.scale() + def __eq__(self, 
other): + if not isinstance(other, DataType): + return False + return self.id() == other.id() and self.scale() == other.scale() + @staticmethod cdef DataType from_libcudf(data_type dt): """Create a DataType from a libcudf data_type. diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py new file mode 100644 index 00000000000..6636ab9e5f8 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -0,0 +1,111 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import Optional + +import pyarrow as pa +import pytest + +from cudf._lib import pylibcudf as plc + + +def metadata_from_arrow_array( + pa_array: pa.Array, +) -> Optional[plc.interop.ColumnMetadata]: + metadata = None + if pa.types.is_list(dtype := pa_array.type) or pa.types.is_struct(dtype): + metadata = plc.interop.ColumnMetadata( + "", + # libcudf does not store field names, so just match pyarrow's. + [ + plc.interop.ColumnMetadata(pa_array.type.field(i).name) + for i in range(pa_array.type.num_fields) + ], + ) + return metadata + + +def assert_column_eq(plc_column: plc.Column, pa_array: pa.Array) -> None: + """Verify that the pylibcudf array and PyArrow array are equal.""" + # Nested types require children metadata to be passed to the conversion function. 
+    plc_pa = plc.interop.to_arrow(
+        plc_column, metadata=metadata_from_arrow_array(pa_array)
+    )
+
+    if isinstance(plc_pa, pa.ChunkedArray):
+        plc_pa = plc_pa.combine_chunks()
+    if isinstance(pa_array, pa.ChunkedArray):
+        pa_array = pa_array.combine_chunks()
+
+    assert plc_pa.equals(pa_array)
+
+
+def assert_table_eq(plc_table: plc.Table, pa_table: pa.Table) -> None:
+    """Verify that the pylibcudf table and PyArrow table are equal."""
+    plc_shape = (plc_table.num_rows(), plc_table.num_columns())
+    assert plc_shape == pa_table.shape
+
+    for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns):
+        assert_column_eq(plc_col, pa_col)
+
+
+def cudf_raises(expected_exception: BaseException, *args, **kwargs):
+    # A simple wrapper around pytest.raises that defaults to looking for cudf exceptions
+    match = kwargs.get("match", None)
+    if match is None:
+        kwargs["match"] = "CUDF failure at"
+    return pytest.raises(expected_exception, *args, **kwargs)
+
+
+# TODO: Consider moving these type utilities into pylibcudf.types itself.
+def is_signed_integer(plc_dtype: plc.DataType): + return ( + plc.TypeId.INT8.value <= plc_dtype.id().value <= plc.TypeId.INT64.value + ) + + +def is_unsigned_integer(plc_dtype: plc.DataType): + return plc_dtype.id() in ( + plc.TypeId.UINT8, + plc.TypeId.UINT16, + plc.TypeId.UINT32, + plc.TypeId.UINT64, + ) + + +def is_integer(plc_dtype: plc.DataType): + return plc_dtype.id() in ( + plc.TypeId.INT8, + plc.TypeId.INT16, + plc.TypeId.INT32, + plc.TypeId.INT64, + ) + + +def is_floating(plc_dtype: plc.DataType): + return plc_dtype.id() in ( + plc.TypeId.FLOAT32, + plc.TypeId.FLOAT64, + ) + + +def is_boolean(plc_dtype: plc.DataType): + return plc_dtype.id() == plc.TypeId.BOOL8 + + +def is_string(plc_dtype: plc.DataType): + return plc_dtype.id() == plc.TypeId.STRING + + +def is_fixed_width(plc_dtype: plc.DataType): + return ( + is_integer(plc_dtype) + or is_floating(plc_dtype) + or is_boolean(plc_dtype) + ) + + +# We must explicitly specify this type via a field to ensure we don't include +# nullability accidentally. +DEFAULT_STRUCT_TESTING_TYPE = pa.struct( + [pa.field("v", pa.int64(), nullable=False)] +) diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py new file mode 100644 index 00000000000..6d8284fb3db --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Tell ruff it's OK that some imports occur after the sys.path.insert +# ruff: noqa: E402 +import os +import sys + +import pyarrow as pa +import pytest + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) + +from utils import DEFAULT_STRUCT_TESTING_TYPE + + +# This fixture defines the standard set of types that all tests should default to +# running on. If there is a need for some tests to run on a different set of types, that +# type list fixture should also be defined below here if it is likely to be reused +# across modules. 
Otherwise it may be defined on a per-module basis. +@pytest.fixture( + scope="session", + params=[ + pa.int64(), + pa.float64(), + pa.string(), + pa.bool_(), + pa.list_(pa.int64()), + DEFAULT_STRUCT_TESTING_TYPE, + ], +) +def pa_type(request): + return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/pytest.ini b/python/cudf/cudf/pylibcudf_tests/pytest.ini new file mode 100644 index 00000000000..1761c0f011c --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/pytest.ini @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +xfail_strict = true +filterwarnings = + error + ignore:::.*xdist.* + ignore:::.*pytest.* diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py new file mode 100644 index 00000000000..0bf30f98636 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -0,0 +1,848 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from utils import ( + DEFAULT_STRUCT_TESTING_TYPE, + assert_column_eq, + assert_table_eq, + cudf_raises, + is_fixed_width, + is_floating, + is_integer, + is_string, + metadata_from_arrow_array, +) + +from cudf._lib import pylibcudf as plc + + +# TODO: Test nullable data +@pytest.fixture(scope="module") +def pa_input_column(pa_type): + if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): + return pa.array([1, 2, 3], type=pa_type) + elif pa.types.is_string(pa_type): + return pa.array(["a", "b", "c"], type=pa_type) + elif pa.types.is_boolean(pa_type): + return pa.array([True, True, False], type=pa_type) + elif pa.types.is_list(pa_type): + # TODO: Add heterogenous sizes + return pa.array([[1], [2], [3]], type=pa_type) + elif pa.types.is_struct(pa_type): + return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + raise ValueError("Unsupported type") + + +@pytest.fixture(scope="module") +def input_column(pa_input_column): + return 
plc.interop.from_arrow(pa_input_column) + + +@pytest.fixture(scope="module") +def pa_index_column(): + # Index column for testing gather/scatter, always integral. + return pa.array([1, 2, 3]) + + +@pytest.fixture(scope="module") +def index_column(pa_index_column): + return plc.interop.from_arrow(pa_index_column) + + +@pytest.fixture(scope="module") +def pa_target_column(pa_type): + if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): + return pa.array([4, 5, 6, 7, 8, 9], type=pa_type) + elif pa.types.is_string(pa_type): + return pa.array(["d", "e", "f", "g", "h", "i"], type=pa_type) + elif pa.types.is_boolean(pa_type): + return pa.array([False, True, True, False, True, False], type=pa_type) + elif pa.types.is_list(pa_type): + # TODO: Add heterogenous sizes + return pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type) + elif pa.types.is_struct(pa_type): + return pa.array( + [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}], + type=pa_type, + ) + raise ValueError("Unsupported type") + + +@pytest.fixture(scope="module") +def target_column(pa_target_column): + return plc.interop.from_arrow(pa_target_column) + + +@pytest.fixture +def mutable_target_column(target_column): + return target_column.copy() + + +@pytest.fixture(scope="module") +def pa_source_table(pa_input_column): + return pa.table([pa_input_column] * 3, [""] * 3) + + +@pytest.fixture(scope="module") +def source_table(pa_source_table): + return plc.interop.from_arrow(pa_source_table) + + +@pytest.fixture(scope="module") +def pa_target_table(pa_target_column): + return pa.table([pa_target_column] * 3, [""] * 3) + + +@pytest.fixture(scope="module") +def target_table(pa_target_table): + return plc.interop.from_arrow(pa_target_table) + + +@pytest.fixture(scope="module") +def pa_source_scalar(pa_type): + if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): + return pa.scalar(1, type=pa_type) + elif pa.types.is_string(pa_type): + return pa.scalar("a", type=pa_type) + elif 
pa.types.is_boolean(pa_type): + return pa.scalar(False, type=pa_type) + elif pa.types.is_list(pa_type): + # TODO: Longer list? + return pa.scalar([1], type=pa_type) + elif pa.types.is_struct(pa_type): + return pa.scalar({"v": 1}, type=pa_type) + raise ValueError("Unsupported type") + + +@pytest.fixture(scope="module") +def source_scalar(pa_source_scalar): + return plc.interop.from_arrow(pa_source_scalar) + + +@pytest.fixture(scope="module") +def pa_mask(pa_target_column): + return pa.array([True, False] * (len(pa_target_column) // 2)) + + +@pytest.fixture(scope="module") +def mask(pa_mask): + return plc.interop.from_arrow(pa_mask) + + +def test_gather(target_table, pa_target_table, index_column, pa_index_column): + result = plc.copying.gather( + target_table, + index_column, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + expected = pa_target_table.take(pa_index_column) + assert_table_eq(result, expected) + + +def test_gather_map_has_nulls(target_table): + gather_map = plc.interop.from_arrow(pa.array([0, 1, None])) + with cudf_raises(ValueError): + plc.copying.gather( + target_table, + gather_map, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + + +def _pyarrow_index_to_mask(indices, mask_size): + # Convert a list of indices to a boolean mask. + return pc.is_in(pa.array(range(mask_size)), pa.array(indices)) + + +def _pyarrow_boolean_mask_scatter_column(source, mask, target): + if isinstance(source, pa.Scalar): + # if_else requires array lengths to match exactly or the replacement must be a + # scalar, so we use this in the scalar case. + return pc.if_else(mask, target, source) + + if isinstance(source, pa.ChunkedArray): + source = source.combine_chunks() + if isinstance(target, pa.ChunkedArray): + target = target.combine_chunks() + + # replace_with_mask accepts a column whose size is the number of true values in + # the mask, so we can use it for columnar scatters. 
+ return pc.replace_with_mask(target, mask, source) + + +def _pyarrow_boolean_mask_scatter_table(source, mask, target_table): + # pyarrow equivalent of cudf's boolean_mask_scatter. + return pa.table( + [ + _pyarrow_boolean_mask_scatter_column(r, mask, v) + for v, r in zip(target_table, source) + ], + [""] * target_table.num_columns, + ) + + +def test_scatter_table( + source_table, + pa_source_table, + index_column, + pa_index_column, + target_table, + pa_target_table, +): + result = plc.copying.scatter( + source_table, + index_column, + target_table, + ) + + if pa.types.is_list( + dtype := pa_target_table[0].type + ) or pa.types.is_struct(dtype): + # pyarrow does not support scattering with list data. If and when they do, + # replace this hardcoding with their implementation. + with pytest.raises(pa.ArrowNotImplementedError): + _pyarrow_boolean_mask_scatter_table( + pa_source_table, + _pyarrow_index_to_mask( + pa_index_column, pa_target_table.num_rows + ), + pa_target_table, + ) + + if pa.types.is_list(dtype := pa_target_table[0].type): + expected = pa.table( + [pa.array([[4], [1], [2], [3], [8], [9]])] * 3, [""] * 3 + ) + elif pa.types.is_struct(dtype): + expected = pa.table( + [ + pa.array( + [ + {"v": 4}, + {"v": 1}, + {"v": 2}, + {"v": 3}, + {"v": 8}, + {"v": 9}, + ], + type=DEFAULT_STRUCT_TESTING_TYPE, + ) + ] + * 3, + [""] * 3, + ) + else: + expected = _pyarrow_boolean_mask_scatter_table( + pa_source_table, + _pyarrow_index_to_mask(pa_index_column, pa_target_table.num_rows), + pa_target_table, + ) + + assert_table_eq(result, expected) + + +def test_scatter_table_num_col_mismatch( + source_table, index_column, target_table +): + # Number of columns in source and target must match. + with cudf_raises(ValueError): + plc.copying.scatter( + plc.Table(source_table.columns()[:2]), + index_column, + target_table, + ) + + +def test_scatter_table_num_row_mismatch(source_table, target_table): + # Number of rows in source and scatter map must match. 
+ with cudf_raises(ValueError): + plc.copying.scatter( + source_table, + plc.interop.from_arrow( + pa.array(range(source_table.num_rows() * 2)) + ), + target_table, + ) + + +def test_scatter_table_map_has_nulls(source_table, target_table): + with cudf_raises(ValueError): + plc.copying.scatter( + source_table, + plc.interop.from_arrow(pa.array([None] * source_table.num_rows())), + target_table, + ) + + +def test_scatter_table_type_mismatch(source_table, index_column, target_table): + with cudf_raises(TypeError): + if is_integer( + dtype := target_table.columns()[0].type() + ) or is_floating(dtype): + pa_array = pa.array([True] * source_table.num_rows()) + else: + pa_array = pa.array([1] * source_table.num_rows()) + ncol = source_table.num_columns() + pa_table = pa.table([pa_array] * ncol, [""] * ncol) + plc.copying.scatter( + plc.interop.from_arrow(pa_table), + index_column, + target_table, + ) + + +def test_scatter_scalars( + source_scalar, + pa_source_scalar, + index_column, + pa_index_column, + target_table, + pa_target_table, +): + result = plc.copying.scatter( + [source_scalar] * target_table.num_columns(), + index_column, + target_table, + ) + + expected = _pyarrow_boolean_mask_scatter_table( + [pa_source_scalar] * target_table.num_columns(), + pc.invert( + _pyarrow_index_to_mask(pa_index_column, pa_target_table.num_rows) + ), + pa_target_table, + ) + + assert_table_eq(result, expected) + + +def test_scatter_scalars_num_scalars_mismatch( + source_scalar, index_column, target_table +): + with cudf_raises(ValueError): + plc.copying.scatter( + [source_scalar] * (target_table.num_columns() - 1), + index_column, + target_table, + ) + + +def test_scatter_scalars_map_has_nulls(source_scalar, target_table): + with cudf_raises(ValueError): + plc.copying.scatter( + [source_scalar] * target_table.num_columns(), + plc.interop.from_arrow(pa.array([None, None])), + target_table, + ) + + +def test_scatter_scalars_type_mismatch(index_column, target_table): + with 
cudf_raises(TypeError): + if is_integer( + dtype := target_table.columns()[0].type() + ) or is_floating(dtype): + source_scalar = [plc.interop.from_arrow(pa.scalar(True))] + else: + source_scalar = [plc.interop.from_arrow(pa.scalar(1))] + plc.copying.scatter( + source_scalar * target_table.num_columns(), + index_column, + target_table, + ) + + +def test_empty_like_column(input_column): + result = plc.copying.empty_like(input_column) + assert result.type() == input_column.type() + + +def test_empty_like_table(source_table): + result = plc.copying.empty_like(source_table) + assert result.num_columns() == source_table.num_columns() + for icol, rcol in zip(source_table.columns(), result.columns()): + assert rcol.type() == icol.type() + + +@pytest.mark.parametrize("size", [None, 10]) +def test_allocate_like(input_column, size): + if is_fixed_width(input_column.type()): + result = plc.copying.allocate_like( + input_column, plc.copying.MaskAllocationPolicy.RETAIN, size=size + ) + assert result.type() == input_column.type() + assert result.size() == (input_column.size() if size is None else size) + else: + with pytest.raises(TypeError): + plc.copying.allocate_like( + input_column, + plc.copying.MaskAllocationPolicy.RETAIN, + size=size, + ) + + +def test_copy_range_in_place( + input_column, pa_input_column, mutable_target_column, pa_target_column +): + if not is_fixed_width(mutable_target_column.type()): + with pytest.raises(TypeError): + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 0, + input_column.size(), + 0, + ) + else: + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 0, + input_column.size(), + 0, + ) + expected = _pyarrow_boolean_mask_scatter_column( + pa_input_column, + _pyarrow_index_to_mask( + range(len(pa_input_column)), len(pa_target_column) + ), + pa_target_column, + ) + assert_column_eq(mutable_target_column, expected) + + +def test_copy_range_in_place_out_of_bounds( + input_column, 
mutable_target_column +): + if is_fixed_width(mutable_target_column.type()): + with cudf_raises(IndexError): + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 5, + 5 + input_column.size(), + 0, + ) + + +def test_copy_range_in_place_different_types(mutable_target_column): + if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype): + input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + else: + input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) + + with cudf_raises(TypeError): + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 0, + input_column.size(), + 0, + ) + + +def test_copy_range_in_place_null_mismatch( + pa_input_column, mutable_target_column +): + if is_fixed_width(mutable_target_column.type()): + pa_input_column = pc.if_else( + _pyarrow_index_to_mask([0], len(pa_input_column)), + pa_input_column, + pa.scalar(None, type=pa_input_column.type), + ) + input_column = plc.interop.from_arrow(pa_input_column) + with cudf_raises(ValueError): + plc.copying.copy_range_in_place( + input_column, + mutable_target_column, + 0, + input_column.size(), + 0, + ) + + +def test_copy_range( + input_column, pa_input_column, target_column, pa_target_column +): + if is_fixed_width(dtype := target_column.type()) or is_string(dtype): + result = plc.copying.copy_range( + input_column, + target_column, + 0, + input_column.size(), + 0, + ) + expected = _pyarrow_boolean_mask_scatter_column( + pa_input_column, + _pyarrow_index_to_mask( + range(len(pa_input_column)), len(pa_target_column) + ), + pa_target_column, + ) + assert_column_eq(result, expected) + else: + with pytest.raises(TypeError): + plc.copying.copy_range( + input_column, + target_column, + 0, + input_column.size(), + 0, + ) + + +def test_copy_range_out_of_bounds(input_column, target_column): + with cudf_raises(IndexError): + plc.copying.copy_range( + input_column, + target_column, + 5, + 5 + input_column.size(), + 0, + ) + + +def 
test_copy_range_different_types(target_column): + if is_integer(dtype := target_column.type()) or is_floating(dtype): + input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + else: + input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) + + with cudf_raises(TypeError): + plc.copying.copy_range( + input_column, + target_column, + 0, + input_column.size(), + 0, + ) + + +def test_shift( + target_column, pa_target_column, source_scalar, pa_source_scalar +): + shift = 2 + if is_fixed_width(dtype := target_column.type()) or is_string(dtype): + result = plc.copying.shift(target_column, shift, source_scalar) + expected = pa.concat_arrays( + [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]] + ) + assert_column_eq(result, expected) + else: + with pytest.raises(TypeError): + plc.copying.shift(target_column, shift, source_scalar) + + +def test_shift_type_mismatch(target_column): + if is_integer(dtype := target_column.type()) or is_floating(dtype): + fill_value = plc.interop.from_arrow(pa.scalar("a")) + else: + fill_value = plc.interop.from_arrow(pa.scalar(1)) + + with cudf_raises(TypeError): + plc.copying.shift(target_column, 2, fill_value) + + +def test_slice_column(target_column, pa_target_column): + bounds = list(range(6)) + upper_bounds = bounds[1::2] + lower_bounds = bounds[::2] + result = plc.copying.slice(target_column, bounds) + for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): + assert_column_eq(slice_, pa_target_column[lb:ub]) + + +def test_slice_column_wrong_length(target_column): + with cudf_raises(ValueError): + plc.copying.slice(target_column, list(range(5))) + + +def test_slice_column_decreasing(target_column): + with cudf_raises(ValueError): + plc.copying.slice(target_column, list(range(5, -1, -1))) + + +def test_slice_column_out_of_bounds(target_column): + with cudf_raises(IndexError): + plc.copying.slice(target_column, list(range(2, 8))) + + +def test_slice_table(target_table, pa_target_table): + bounds = 
list(range(6)) + upper_bounds = bounds[1::2] + lower_bounds = bounds[::2] + result = plc.copying.slice(target_table, bounds) + for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): + assert_table_eq(slice_, pa_target_table[lb:ub]) + + +def test_split_column(target_column, pa_target_column): + upper_bounds = [1, 3, 5] + lower_bounds = [0] + upper_bounds[:-1] + result = plc.copying.split(target_column, upper_bounds) + for lb, ub, split in zip(lower_bounds, upper_bounds, result): + assert_column_eq(split, pa_target_column[lb:ub]) + + +def test_split_column_decreasing(target_column): + with cudf_raises(ValueError): + plc.copying.split(target_column, list(range(5, -1, -1))) + + +def test_split_column_out_of_bounds(target_column): + with cudf_raises(IndexError): + plc.copying.split(target_column, list(range(5, 8))) + + +def test_split_table(target_table, pa_target_table): + upper_bounds = [1, 3, 5] + lower_bounds = [0] + upper_bounds[:-1] + result = plc.copying.split(target_table, upper_bounds) + for lb, ub, split in zip(lower_bounds, upper_bounds, result): + assert_table_eq(split, pa_target_table[lb:ub]) + + +def test_copy_if_else_column_column( + target_column, pa_target_column, pa_source_scalar, mask, pa_mask +): + pa_other_column = pa.concat_arrays( + [pa.array([pa_source_scalar] * 2), pa_target_column[:-2]] + ) + other_column = plc.interop.from_arrow(pa_other_column) + + result = plc.copying.copy_if_else( + target_column, + other_column, + mask, + ) + + expected = pc.if_else( + pa_mask, + pa_target_column, + pa_other_column, + ) + assert_column_eq(result, expected) + + +def test_copy_if_else_wrong_type(target_column, mask): + if is_integer(dtype := target_column.type()) or is_floating(dtype): + input_column = plc.interop.from_arrow( + pa.array(["a"] * target_column.size()) + ) + else: + input_column = plc.interop.from_arrow( + pa.array([1] * target_column.size()) + ) + + with cudf_raises(TypeError): + plc.copying.copy_if_else(input_column, target_column, 
mask) + + +def test_copy_if_else_wrong_type_mask(target_column): + with cudf_raises(TypeError): + plc.copying.copy_if_else( + target_column, + target_column, + plc.interop.from_arrow( + pa.array([1.0, 2.0] * (target_column.size() // 2)) + ), + ) + + +def test_copy_if_else_wrong_size(target_column): + with cudf_raises(ValueError): + plc.copying.copy_if_else( + plc.interop.from_arrow(pa.array([1])), + target_column, + plc.interop.from_arrow( + pa.array([True, False] * (target_column.size() // 2)) + ), + ) + + +def test_copy_if_else_wrong_size_mask(target_column): + with cudf_raises(ValueError): + plc.copying.copy_if_else( + target_column, + target_column, + plc.interop.from_arrow(pa.array([True])), + ) + + +@pytest.mark.parametrize("array_left", [True, False]) +def test_copy_if_else_column_scalar( + target_column, + pa_target_column, + source_scalar, + pa_source_scalar, + array_left, + mask, + pa_mask, +): + args = ( + (target_column, source_scalar) + if array_left + else (source_scalar, target_column) + ) + result = plc.copying.copy_if_else( + *args, + mask, + ) + + pa_args = ( + (pa_target_column, pa_source_scalar) + if array_left + else (pa_source_scalar, pa_target_column) + ) + expected = pc.if_else( + pa_mask, + *pa_args, + ) + assert_column_eq(result, expected) + + +def test_boolean_mask_scatter_from_table( + source_table, + pa_source_table, + target_table, + pa_target_table, + mask, + pa_mask, +): + result = plc.copying.boolean_mask_scatter( + source_table, + target_table, + mask, + ) + + if pa.types.is_list( + dtype := pa_target_table[0].type + ) or pa.types.is_struct(dtype): + # pyarrow does not support scattering with list data. If and when they do, + # replace this hardcoding with their implementation. 
+ with pytest.raises(pa.ArrowNotImplementedError): + _pyarrow_boolean_mask_scatter_table( + pa_source_table, pa_mask, pa_target_table + ) + + if pa.types.is_list(dtype := pa_target_table[0].type): + expected = pa.table( + [pa.array([[1], [5], [2], [7], [3], [9]])] * 3, [""] * 3 + ) + elif pa.types.is_struct(dtype): + expected = pa.table( + [ + pa.array( + [ + {"v": 1}, + {"v": 5}, + {"v": 2}, + {"v": 7}, + {"v": 3}, + {"v": 9}, + ], + type=DEFAULT_STRUCT_TESTING_TYPE, + ) + ] + * 3, + [""] * 3, + ) + else: + expected = _pyarrow_boolean_mask_scatter_table( + pa_source_table, pa_mask, pa_target_table + ) + + assert_table_eq(result, expected) + + +def test_boolean_mask_scatter_from_wrong_num_cols(source_table, target_table): + with cudf_raises(ValueError): + plc.copying.boolean_mask_scatter( + plc.Table(source_table.columns()[:2]), + target_table, + plc.interop.from_arrow(pa.array([True, False] * 3)), + ) + + +def test_boolean_mask_scatter_from_wrong_mask_size(source_table, target_table): + with cudf_raises(ValueError): + plc.copying.boolean_mask_scatter( + source_table, + target_table, + plc.interop.from_arrow(pa.array([True, False] * 2)), + ) + + +def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): + with cudf_raises(ValueError): + plc.copying.boolean_mask_scatter( + plc.Table(source_table.columns()[:2]), + target_table, + plc.interop.from_arrow( + pa.array([True, False] * 2 + [False, False]) + ), + ) + + +def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask): + if is_integer(dtype := target_table.columns()[0].type()) or is_floating( + dtype + ): + input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + else: + input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) + + with cudf_raises(TypeError): + plc.copying.boolean_mask_scatter( + plc.Table([input_column] * 3), target_table, mask + ) + + +def test_boolean_mask_scatter_from_wrong_mask_type(source_table, target_table): + with cudf_raises(TypeError): + 
plc.copying.boolean_mask_scatter( + source_table, + target_table, + plc.interop.from_arrow(pa.array([1.0, 2.0] * 3)), + ) + + +def test_boolean_mask_scatter_from_scalars( + source_scalar, + pa_source_scalar, + target_table, + pa_target_table, + mask, + pa_mask, +): + result = plc.copying.boolean_mask_scatter( + [source_scalar] * 3, + target_table, + mask, + ) + + expected = _pyarrow_boolean_mask_scatter_table( + [pa_source_scalar] * target_table.num_columns(), + pc.invert(pa_mask), + pa_target_table, + ) + + assert_table_eq(result, expected) + + +def test_get_element(input_column, pa_input_column): + index = 1 + result = plc.copying.get_element(input_column, index) + + assert ( + plc.interop.to_arrow( + result, metadata_from_arrow_array(pa_input_column) + ).as_py() + == pa_input_column[index].as_py() + ) + + +def test_get_element_out_of_bounds(input_column): + with cudf_raises(IndexError): + plc.copying.get_element(input_column, 100) From 0a8807eb2f8f87cbfdc49538b73ff498526adf66 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 1 Apr 2024 14:31:16 -0700 Subject: [PATCH 014/272] Apply the cuFile error work around to data_sink as well (#15335) Issue #14140 Follow-up on https://github.com/rapidsai/cudf/pull/15293 Moving the `cudaFree(0)` call to a function called both by file `datasource` and `data_sink`. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/15335 --- cpp/src/io/utilities/data_sink.cpp | 1 + cpp/src/io/utilities/datasource.cpp | 6 +----- cpp/src/io/utilities/file_io_utilities.cpp | 8 ++++++++ cpp/src/io/utilities/file_io_utilities.hpp | 3 +++ 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 5557648ebbe..66905c5256f 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -36,6 +36,7 @@ class file_sink : public data_sink { public: explicit file_sink(std::string const& filepath) { + detail::force_init_cuda_context(); _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); if (!_output_stream.is_open()) { detail::throw_on_file_open_failure(filepath, true); } diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 54e7c6bf1d6..d8dbd3614c8 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -43,12 +43,8 @@ class file_source : public datasource { public: explicit file_source(char const* filepath) : _file(filepath, O_RDONLY) { + detail::force_init_cuda_context(); if (detail::cufile_integration::is_kvikio_enabled()) { - // Workaround for https://github.com/rapidsai/cudf/issues/14140, where cuFileDriverOpen errors - // out if no CUDA calls have been made before it. This is a no-op if the CUDA context is - // already initialized - cudaFree(0); - _kvikio_file = kvikio::FileHandle(filepath); CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? 
"on" : "off"); diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 01090a43a0e..39031526fc8 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -34,6 +34,14 @@ namespace cudf { namespace io { namespace detail { +void force_init_cuda_context() +{ + // Workaround for https://github.com/rapidsai/cudf/issues/14140, where cuFileDriverOpen errors + // out if no CUDA calls have been made before it. This is a no-op if the CUDA context is already + // initialized. + cudaFree(0); +} + [[noreturn]] void throw_on_file_open_failure(std::string const& filepath, bool is_create) { // save errno because it may be overwritten by subsequent calls diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 0d5a5b218da..74a2ae53961 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -37,6 +37,9 @@ namespace detail { [[noreturn]] void throw_on_file_open_failure(std::string const& filepath, bool is_create); +// Call before any cuFile API calls to ensure the CUDA context is initialized. +void force_init_cuda_context(); + /** * @brief Class that provides RAII for file handling. */ From e5f9e2d6d39df4c5f4a6b7bab150a1fa00f0a1cb Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 1 Apr 2024 17:43:37 -0400 Subject: [PATCH 015/272] Refactor stream mode setup for gtests (#15337) Setting up the stream mode logic was duplicated in `testing_main.hpp` and `error_handing_test.cu`. Refactoring the logic will help setup for a large strings test fixture in a follow-on PR. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - https://github.com/nvdbaranec - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/15337 --- cpp/include/cudf_test/testing_main.hpp | 57 ++++++++++++++++---------- cpp/tests/error/error_handling_test.cu | 14 +------ 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index 88e3088d794..ecac761f7cb 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -145,6 +145,32 @@ inline auto parse_cudf_test_opts(int argc, char** argv) } } +/** + * @brief Sets up stream mode memory resource adaptor + * + * The resource adaptor is only set as the current device resource if the + * stream mode is enabled. + * + * The caller must keep the return object alive for the life of the test runs. + * + * @param cmd_opts Command line options returned by parse_cudf_test_opts + * @return Memory resource adaptor + */ +inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts) +{ + auto resource = rmm::mr::get_current_device_resource(); + auto const stream_mode = cmd_opts["stream_mode"].as(); + auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); + auto const error_on_invalid_stream = (stream_error_mode == "error"); + auto const check_default_stream = (stream_mode == "new_cudf_default"); + auto adaptor = + make_stream_checking_resource_adaptor(resource, error_on_invalid_stream, check_default_stream); + if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { + rmm::mr::set_current_device_resource(&adaptor); + } + return adaptor; +} + /** * @brief Macro that defines main function for gtest programs that use rmm * @@ -155,25 +181,14 @@ inline auto parse_cudf_test_opts(int argc, char** argv) * function parses the command line to customize test behavior, like the * 
allocation mode used for creating the default memory resource. */ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cudf::test::create_memory_resource(rmm_mode); \ - rmm::mr::set_current_device_resource(resource.get()); \ - \ - auto const stream_mode = cmd_opts["stream_mode"].as(); \ - if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { \ - auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); \ - auto const error_on_invalid_stream = (stream_error_mode == "error"); \ - auto const check_default_stream = (stream_mode == "new_cudf_default"); \ - auto adaptor = make_stream_checking_resource_adaptor( \ - resource.get(), error_on_invalid_stream, check_default_stream); \ - rmm::mr::set_current_device_resource(&adaptor); \ - return RUN_ALL_TESTS(); \ - } \ - \ - return RUN_ALL_TESTS(); \ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ + auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ + auto resource = cudf::test::create_memory_resource(rmm_mode); \ + rmm::mr::set_current_device_resource(resource.get()); \ + auto adaptor = make_stream_mode_adaptor(cmd_opts); \ + return RUN_ALL_TESTS(); \ } diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 674d2e0a6ea..46d01ec14ff 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -128,17 +128,7 @@ TEST(DebugAssert, cudf_assert_true) int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - auto const cmd_opts = parse_cudf_test_opts(argc, argv); - auto const stream_mode = cmd_opts["stream_mode"].as(); - if ((stream_mode == "new_cudf_default") || 
(stream_mode == "new_testing_default")) { - auto resource = rmm::mr::get_current_device_resource(); - auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); - auto const error_on_invalid_stream = (stream_error_mode == "error"); - auto const check_default_stream = (stream_mode == "new_cudf_default"); - auto adaptor = make_stream_checking_resource_adaptor( - resource, error_on_invalid_stream, check_default_stream); - rmm::mr::set_current_device_resource(&adaptor); - return RUN_ALL_TESTS(); - } + auto const cmd_opts = parse_cudf_test_opts(argc, argv); + auto adaptor = make_stream_mode_adaptor(cmd_opts); return RUN_ALL_TESTS(); } From 09f8c8ad92b5b59a4525ee256feca6a68564b003 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 1 Apr 2024 17:23:28 -0500 Subject: [PATCH 016/272] Enable ``dask_cudf`` json and s3 tests with query-planning on (#15408) Addresses parts of https://github.com/rapidsai/cudf/issues/15027 (json and s3 testing). Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15408 --- python/dask_cudf/dask_cudf/backends.py | 15 +++++++++++- .../dask_cudf/dask_cudf/io/tests/test_json.py | 4 ++-- .../dask_cudf/io/tests/test_parquet.py | 2 +- .../dask_cudf/dask_cudf/io/tests/test_s3.py | 6 +---- python/dask_cudf/dask_cudf/tests/utils.py | 24 +++++++++++++++---- 5 files changed, 38 insertions(+), 13 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index c7b4a1c4c6a..d05be30602e 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -2,6 +2,7 @@ import warnings from collections.abc import Iterator +from functools import partial import cupy as cp import numpy as np @@ -484,7 +485,6 @@ def sizeof_cudf_series_index(obj): def _simple_cudf_encode(_): # Basic pickle-based encoding for a partd k-v 
store import pickle - from functools import partial import partd @@ -686,6 +686,19 @@ def from_dict( constructor=constructor, ) + @staticmethod + def read_json(*args, engine="auto", **kwargs): + return _default_backend( + dd.read_json, + *args, + engine=( + partial(cudf.read_json, engine=engine) + if isinstance(engine, str) + else engine + ), + **kwargs, + ) + # Import/register cudf-specific classes for dask-expr try: diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index a2b1d7fc114..8dcf3f05e89 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -12,8 +12,8 @@ import dask_cudf from dask_cudf.tests.utils import skip_dask_expr -# No dask-expr support -pytestmark = skip_dask_expr() +# No dask-expr support for dask_expr<=1.0.5 +pytestmark = skip_dask_expr(lt_version="1.0.5+a") def test_read_json_backend_dispatch(tmp_path): diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index de2a735b2ce..df41ef77b7c 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -535,7 +535,7 @@ def test_check_file_size(tmpdir): dask_cudf.io.read_parquet(fn, check_file_size=1).compute() -@xfail_dask_expr("HivePartitioning cannot be hashed") +@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="1.0") def test_null_partition(tmpdir): import pyarrow as pa from pyarrow.dataset import HivePartitioning diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index f4a6fabdb60..a67404da4fe 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -10,10 +10,6 @@ import pytest import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support -pytestmark = skip_dask_expr() moto = 
pytest.importorskip("moto", minversion="3.1.6") boto3 = pytest.importorskip("boto3") @@ -111,7 +107,7 @@ def test_read_csv(s3_base, s3so): s3_base=s3_base, bucket="daskcsv", files={"a.csv": b"a,b\n1,2\n3,4\n"} ): df = dask_cudf.read_csv( - "s3://daskcsv/*.csv", chunksize="50 B", storage_options=s3so + "s3://daskcsv/*.csv", blocksize="50 B", storage_options=s3so ) assert df.a.sum().compute() == 4 diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index e838b8d63bc..1ca1758736b 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd import pytest +from packaging.version import Version import dask.dataframe as dd @@ -10,6 +11,13 @@ from dask_cudf.expr import QUERY_PLANNING_ON +if QUERY_PLANNING_ON: + import dask_expr + + DASK_EXPR_VERSION = Version(dask_expr.__version__) +else: + DASK_EXPR_VERSION = None + def _make_random_frame(nelem, npartitions=2, include_na=False): df = pd.DataFrame( @@ -27,9 +35,17 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): _default_reason = "Not compatible with dask-expr" -def skip_dask_expr(reason=_default_reason): - return pytest.mark.skipif(QUERY_PLANNING_ON, reason=reason) +def skip_dask_expr(reason=_default_reason, lt_version=None): + if lt_version is not None: + skip = QUERY_PLANNING_ON and DASK_EXPR_VERSION < Version(lt_version) + else: + skip = QUERY_PLANNING_ON + return pytest.mark.skipif(skip, reason=reason) -def xfail_dask_expr(reason=_default_reason): - return pytest.mark.xfail(QUERY_PLANNING_ON, reason=reason) +def xfail_dask_expr(reason=_default_reason, lt_version=None): + if lt_version is not None: + xfail = QUERY_PLANNING_ON and DASK_EXPR_VERSION < Version(lt_version) + else: + xfail = QUERY_PLANNING_ON + return pytest.mark.xfail(xfail, reason=reason) From 268996ad101dc69414992aa0227eba4f93012c91 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 1 Apr 2024 
18:59:48 -0400 Subject: [PATCH 017/272] Add `to_arrow_device` function to cudf interop using nanoarrow (#15047) Introduce new `to_arrow_device` and `to_arrow_schema` functions to utilize the `ArrowDeviceArray` structure for zero-copy passing of libcudf::table. Add nanoarrow as a vendored lib and a script to update it. Initial step towards addressing #14926 Authors: - Matt Topol (https://github.com/zeroshade) - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15047 --- cpp/CMakeLists.txt | 8 +- cpp/cmake/thirdparty/get_nanoarrow.cmake | 36 + cpp/include/cudf/interop.hpp | 96 ++- cpp/include/cudf/interop/detail/arrow.hpp | 48 ++ cpp/src/interop/to_arrow_device.cu | 727 ++++++++++++++++++++ cpp/tests/CMakeLists.txt | 7 +- cpp/tests/interop/nanoarrow_utils.hpp | 226 +++++++ cpp/tests/interop/to_arrow_device_test.cpp | 739 +++++++++++++++++++++ docs/cudf/source/conf.py | 1 + 9 files changed, 1882 insertions(+), 6 deletions(-) create mode 100644 cpp/cmake/thirdparty/get_nanoarrow.cmake create mode 100644 cpp/include/cudf/interop/detail/arrow.hpp create mode 100644 cpp/src/interop/to_arrow_device.cu create mode 100644 cpp/tests/interop/nanoarrow_utils.hpp create mode 100644 cpp/tests/interop/to_arrow_device_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 618d03f7078..f1d43e3c35f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -210,12 +210,14 @@ include(cmake/thirdparty/get_kvikio.cmake) include(cmake/thirdparty/get_fmt.cmake) # find spdlog include(cmake/thirdparty/get_spdlog.cmake) +# find nanoarrow +include(cmake/thirdparty/get_nanoarrow.cmake) # Workaround until https://github.com/rapidsai/rapids-cmake/issues/176 is resolved if(NOT BUILD_SHARED_LIBS) 
include("${rapids-cmake-dir}/export/find_package_file.cmake") list(APPEND METADATA_KINDS BUILD INSTALL) - list(APPEND dependencies KvikIO ZLIB nvcomp) + list(APPEND dependencies KvikIO ZLIB nvcomp nanoarrow) if(TARGET cufile::cuFile_interface) list(APPEND dependencies cuFile) endif() @@ -358,6 +360,7 @@ add_library( src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/to_arrow.cu + src/interop/to_arrow_device.cu src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp src/io/avro/avro_gpu.cu @@ -735,6 +738,7 @@ target_include_directories( "$" "$" PRIVATE "$" + "$" INTERFACE "$" ) @@ -783,7 +787,7 @@ target_link_libraries( cudf PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio - $ + $ nanoarrow ) # Add Conda library, and include paths if specified diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake new file mode 100644 index 00000000000..be938a89ccd --- /dev/null +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -0,0 +1,36 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds nanoarrow and sets any additional necessary environment variables. 
+function(find_and_configure_nanoarrow) + set(oneValueArgs VERSION FORK PINNED_TAG) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + rapids_cpm_find( + nanoarrow ${PKG_VERSION} + GLOBAL_TARGETS nanoarrow + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/arrow-nanoarrow.git + GIT_TAG ${PKG_PINNED_TAG} + # TODO: Commit hashes are not supported with shallow clones. Can switch this if and when we pin + # to an actual tag. + GIT_SHALLOW FALSE + OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" + ) + set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) +endfunction() + +find_and_configure_nanoarrow( + VERSION 0.4.0 FORK apache PINNED_TAG c97720003ff863b81805bcdb9f7c91306ab6b6a8 +) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 2ee6f19614d..871f48e3aac 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,11 +34,16 @@ #include #include #include +#include #include struct DLManagedTensor; +struct ArrowDeviceArray; + +struct ArrowSchema; + namespace cudf { /** * @addtogroup interop_dlpack @@ -162,6 +167,95 @@ std::shared_ptr to_arrow(cudf::scalar const& input, column_metadata const& metadata = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); + +/** + * @brief typedef for a unique_ptr to an ArrowSchema with custom deleter + * + */ +using unique_schema_t = std::unique_ptr; + +/** + * @brief typedef for a unique_ptr to an ArrowDeviceArray with a custom deleter + * + */ +using unique_device_array_t = std::unique_ptr; + +/** + * @brief Create ArrowSchema from cudf table and metadata + * + * Populates and returns an ArrowSchema C struct using a table and metadata. + * + * @note For decimals, since the precision is not stored for them in libcudf, + * decimals will be converted to an Arrow decimal128 which has the widest precision that cudf + * decimal type supports. For example, `numeric::decimal32` will be converted to Arrow decimal128 + * with the precision of 9 which is the maximum precision for 32-bit types. Similarly, + * `numeric::decimal128` will be converted to Arrow decimal128 with the precision of 38. + * + * @param input Table to create a schema from + * @param metadata Contains the hierarchy of names of columns and children + * @return ArrowSchema generated from `input` + */ +unique_schema_t to_arrow_schema(cudf::table_view const& input, + cudf::host_span metadata); + +/** + * @brief Create `ArrowDeviceArray` from cudf table and metadata + * + * Populates the C struct ArrowDeviceArray without performing copies if possible. + * This maintains the data on the GPU device and gives ownership of the table + * and its buffers to the ArrowDeviceArray struct. + * + * After calling this function, the release callback on the returned ArrowDeviceArray + * must be called to clean up the memory. 
+ * + * @note For decimals, since the precision is not stored for them in libcudf + * it will be converted to an Arrow decimal128 with the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. + * + * @note Copies will be performed in the cases where cudf differs from Arrow + * such as in the representation of bools (Arrow uses a bitmap, cudf uses 1-byte per value). + * + * @param table Input table, ownership of the data will be moved to the result + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray which will have ownership of the GPU data, consumer must call release + */ +unique_device_array_t to_arrow_device( + cudf::table&& table, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create `ArrowDeviceArray` from cudf column and metadata + * + * Populates the C struct ArrowDeviceArray without performing copies if possible. + * This maintains the data on the GPU device and gives ownership of the table + * and its buffers to the ArrowDeviceArray struct. + * + * After calling this function, the release callback on the returned ArrowDeviceArray + * must be called to clean up the memory. + * + * @note For decimals, since the precision is not stored for them in libcudf + * it will be converted to an Arrow decimal128 with the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. 
Similar, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. + * + * @note Copies will be performed in the cases where cudf differs from Arrow such as + * in the representation of bools (Arrow uses a bitmap, cudf uses 1 byte per value). + * + * @param col Input column, ownership of the data will be moved to the result + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray which will have ownership of the GPU data + */ +unique_device_array_t to_arrow_device( + cudf::column&& col, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Create `cudf::table` from given arrow Table input * diff --git a/cpp/include/cudf/interop/detail/arrow.hpp b/cpp/include/cudf/interop/detail/arrow.hpp new file mode 100644 index 00000000000..8043ecf5422 --- /dev/null +++ b/cpp/include/cudf/interop/detail/arrow.hpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +// from Arrow C Device Data Interface +// https://arrow.apache.org/docs/format/CDeviceDataInterface.html +#ifndef ARROW_C_DEVICE_DATA_INTERFACE +#define ARROW_C_DEVICE_DATA_INTERFACE + +// Device type for the allocated memory +typedef int32_t ArrowDeviceType; + +// CPU device, same as using ArrowArray directly +#define ARROW_DEVICE_CPU 1 +// CUDA GPU Device +#define ARROW_DEVICE_CUDA 2 +// Pinned CUDA CPU memory by cudaMallocHost +#define ARROW_DEVICE_CUDA_HOST 3 +// CUDA managed/unified memory allocated by cudaMallocManaged +#define ARROW_DEVICE_CUDA_MANAGED 13 + +struct ArrowDeviceArray { + struct ArrowArray array; + int64_t device_id; + ArrowDeviceType device_type; + void* sync_event; + + // reserved bytes for future expansion + int64_t reserved[3]; +}; + +#endif // ARROW_C_DEVICE_DATA_INTERFACE diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu new file mode 100644 index 00000000000..e824412e71c --- /dev/null +++ b/cpp/src/interop/to_arrow_device.cu @@ -0,0 +1,727 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace detail { +namespace { +static constexpr int validity_buffer_idx = 0; +static constexpr int fixed_width_data_buffer_idx = 1; + +ArrowType id_to_arrow_type(cudf::type_id id) +{ + switch (id) { + case cudf::type_id::BOOL8: return NANOARROW_TYPE_BOOL; + case cudf::type_id::INT8: return NANOARROW_TYPE_INT8; + case cudf::type_id::INT16: return NANOARROW_TYPE_INT16; + case cudf::type_id::INT32: return NANOARROW_TYPE_INT32; + case cudf::type_id::INT64: return NANOARROW_TYPE_INT64; + case cudf::type_id::UINT8: return NANOARROW_TYPE_UINT8; + case cudf::type_id::UINT16: return NANOARROW_TYPE_UINT16; + case cudf::type_id::UINT32: return NANOARROW_TYPE_UINT32; + case cudf::type_id::UINT64: return NANOARROW_TYPE_UINT64; + case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT; + case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE; + case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32; + default: CUDF_FAIL("Unsupported type_id conversion to arrow type"); + } +} + +struct dispatch_to_arrow_type { + template ())> + int operator()(column_view, column_metadata const&, ArrowSchema*) + { + CUDF_FAIL("Unsupported type for to_arrow_schema"); + } + + template ())> + int operator()(column_view input_view, column_metadata const&, ArrowSchema* out) + { + cudf::type_id id = input_view.type().id(); + switch (id) { + case cudf::type_id::TIMESTAMP_SECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_SECOND, nullptr); + case cudf::type_id::TIMESTAMP_MILLISECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr); + case cudf::type_id::TIMESTAMP_MICROSECONDS: + return ArrowSchemaSetTypeDateTime( + out, 
NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MICRO, nullptr); + case cudf::type_id::TIMESTAMP_NANOSECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_NANO, nullptr); + case cudf::type_id::DURATION_SECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_SECOND, nullptr); + case cudf::type_id::DURATION_MILLISECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_MILLI, nullptr); + case cudf::type_id::DURATION_MICROSECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_MICRO, nullptr); + case cudf::type_id::DURATION_NANOSECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_NANO, nullptr); + default: return ArrowSchemaSetType(out, id_to_arrow_type(id)); + } + } +}; + +template +int decimals_to_arrow(column_view input, ArrowSchema* out) +{ + // Arrow doesn't support decimal32/decimal64 currently. decimal128 + // is the smallest that arrow supports besides float32/float64 so we + // upcast to decimal128. 
+ return ArrowSchemaSetTypeDecimal(out, + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -input.type().scale()); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const&, + ArrowSchema* out) +{ + using DeviceType = int32_t; + return decimals_to_arrow(input, out); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const&, + ArrowSchema* out) +{ + using DeviceType = int64_t; + return decimals_to_arrow(input, out); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const&, + ArrowSchema* out) +{ + using DeviceType = __int128_t; + return decimals_to_arrow(input, out); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const&, + ArrowSchema* out) +{ + return ArrowSchemaSetType(out, NANOARROW_TYPE_STRING); +} + +// these forward declarations are needed due to the recursive calls to them +// inside their definitions and in struct_vew for handling children +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out); + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out); + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out) +{ + CUDF_EXPECTS(metadata.children_meta.size() == static_cast(input.num_children()), + "Number of field names and number of children doesn't match\n"); + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeStruct(out, input.num_children())); + for (int i = 0; i < input.num_children(); ++i) { + auto child = out->children[i]; + auto col = input.child(i); + ArrowSchemaInit(child); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(child, metadata.children_meta[i].name.c_str())); + + child->flags = col.has_nulls() ? 
ARROW_FLAG_NULLABLE : 0; + + if (col.type().id() == cudf::type_id::EMPTY) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(child, NANOARROW_TYPE_NA)); + continue; + } + + NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher( + col.type(), detail::dispatch_to_arrow_type{}, col, metadata.children_meta[i], child)); + } + + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out) +{ + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, NANOARROW_TYPE_LIST)); + auto child = input.child(cudf::lists_column_view::child_column_index); + ArrowSchemaInit(out->children[0]); + if (child.type().id() == cudf::type_id::EMPTY) { + return ArrowSchemaSetType(out->children[0], NANOARROW_TYPE_NA); + } + auto child_meta = + metadata.children_meta.empty() ? column_metadata{"element"} : metadata.children_meta[0]; + + out->flags = input.has_nulls() ? ARROW_FLAG_NULLABLE : 0; + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(out->children[0], child_meta.name.c_str())); + out->children[0]->flags = child.has_nulls() ? ARROW_FLAG_NULLABLE : 0; + return cudf::type_dispatcher( + child.type(), detail::dispatch_to_arrow_type{}, child, child_meta, out->children[0]); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out) +{ + cudf::dictionary_column_view dview{input}; + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, id_to_arrow_type(dview.indices().type().id()))); + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateDictionary(out)); + ArrowSchemaInit(out->dictionary); + + auto dict_keys = dview.keys(); + return cudf::type_dispatcher( + dict_keys.type(), + detail::dispatch_to_arrow_type{}, + dict_keys, + metadata.children_meta.empty() ? 
column_metadata{"keys"} : metadata.children_meta[0], + out->dictionary); +} + +template +void device_buffer_finalize(ArrowBufferAllocator* allocator, uint8_t*, int64_t) +{ + auto* unique_buffer = reinterpret_cast*>(allocator->private_data); + delete unique_buffer; +} + +template +struct is_device_scalar : public std::false_type {}; + +template +struct is_device_scalar> : public std::true_type {}; + +template +struct is_device_uvector : public std::false_type {}; + +template +struct is_device_uvector> : public std::true_type {}; + +template +int set_buffer(std::unique_ptr device_buf, int64_t i, ArrowArray* out) +{ + ArrowBuffer* buf = ArrowArrayBuffer(out, i); + auto ptr = reinterpret_cast(device_buf->data()); + buf->size_bytes = [&] { + if constexpr (is_device_scalar::value) { + return sizeof(typename T::value_type); + } else if constexpr (is_device_uvector::value) { + return sizeof(typename T::value_type) * device_buf->size(); + } else { + return device_buf->size(); + } + }(); + // we make a new unique_ptr and move to it in case there was a custom deleter + NANOARROW_RETURN_NOT_OK( + ArrowBufferSetAllocator(buf, + ArrowBufferDeallocator(&device_buffer_finalize, + new std::unique_ptr(std::move(device_buf))))); + buf->data = ptr; + return NANOARROW_OK; +} + +int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column const& column) +{ + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(arr, storage_type)); + arr->length = column.size(); + arr->null_count = column.null_count(); + return NANOARROW_OK; +} + +struct dispatch_to_arrow_device { + template ())> + int operator()(cudf::column&&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*, + ArrowArray*) + { + CUDF_FAIL("Unsupported type for to_arrow_device"); + } + + template ())> + int operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) + { + nanoarrow::UniqueArray tmp; + + const ArrowType storage_type = [&] { + switch 
(column.type().id()) { + case cudf::type_id::TIMESTAMP_SECONDS: + case cudf::type_id::TIMESTAMP_MILLISECONDS: + case cudf::type_id::TIMESTAMP_MICROSECONDS: + case cudf::type_id::TIMESTAMP_NANOSECONDS: return NANOARROW_TYPE_INT64; + case cudf::type_id::DURATION_SECONDS: + case cudf::type_id::DURATION_MILLISECONDS: + case cudf::type_id::DURATION_MICROSECONDS: + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TYPE_INT64; + default: return id_to_arrow_type(column.type().id()); + } + }(); + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), storage_type, column)); + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.data), fixed_width_data_buffer_idx, tmp.get())); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } +}; + +template +int decimals_to_arrow(cudf::column&& input, + int32_t precision, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input)); + + if constexpr (!std::is_same_v) { + constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); + auto buf = + std::make_unique>(input.size() * BIT_WIDTH_RATIO, stream, mr); + + auto count = thrust::make_counting_iterator(0); + + thrust::for_each(rmm::exec_policy(stream, mr), + count, + count + input.size(), + [in = input.view().begin(), + out = buf->data(), + BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // the lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. 
+ out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; + } + }); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get())); + } + + auto contents = input.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + if constexpr (std::is_same_v) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.data), fixed_width_data_buffer_idx, tmp.get())); + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + using DeviceType = int32_t; + return decimals_to_arrow( + std::move(column), cudf::detail::max_precision(), stream, mr, out); +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + using DeviceType = int64_t; + return decimals_to_arrow( + std::move(column), cudf::detail::max_precision(), stream, mr, out); +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + using DeviceType = __int128_t; + return decimals_to_arrow( + std::move(column), cudf::detail::max_precision(), stream, mr, out); +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); + + auto bitmask = bools_to_mask(column.view(), stream, mr); + auto contents = column.release(); + if (contents.null_mask) { + 
NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(bitmask.first), fixed_width_data_buffer_idx, tmp.get())); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRING, column)); + + if (column.size() == 0) { + // the scalar zero here is necessary because the spec for string arrays states + // that the offsets buffer should contain "length + 1" signed integers. So in + // the case of a 0 length string array, there should be exactly 1 value, zero, + // in the offsets buffer. While some arrow implementations may accept a zero-sized + // offsets buffer, best practices would be to allocate the buffer with the single value. 
+ auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + auto offsets_contents = + contents.children[cudf::strings_column_view::offsets_column_index]->release(); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(offsets_contents.data), 1, tmp.get())); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(contents.data), 2, tmp.get())); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out); + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out); + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRUCT, column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), column.num_children())); + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + for (size_t i = 0; i < size_t(tmp->n_children); ++i) { + ArrowArray* child_ptr = tmp->children[i]; + auto& child = contents.children[i]; + if (child->type().id() == cudf::type_id::EMPTY) { + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(child_ptr, NANOARROW_TYPE_NA)); + child_ptr->length = child->size(); + child_ptr->null_count = child->size(); + } else { + 
NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher( + child->type(), dispatch_to_arrow_device{}, std::move(*child), stream, mr, child_ptr)); + } + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_LIST, column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), 1)); + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + auto offsets_contents = + contents.children[cudf::lists_column_view::offsets_column_index]->release(); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(offsets_contents.data), 1, tmp.get())); + + auto& child = contents.children[cudf::lists_column_view::child_column_index]; + if (child->type().id() == cudf::type_id::EMPTY) { + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(tmp->children[0], NANOARROW_TYPE_NA)); + tmp->children[0]->length = 0; + tmp->children[0]->null_count = 0; + } else { + NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher( + child->type(), dispatch_to_arrow_device{}, std::move(*child), stream, mr, tmp->children[0])); + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array( + tmp.get(), + id_to_arrow_type(column.child(cudf::dictionary_column_view::indices_column_index).type().id()), + column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateDictionary(tmp.get())); + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + 
set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + auto indices_contents = + contents.children[cudf::dictionary_column_view::indices_column_index]->release(); + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(indices_contents.data), fixed_width_data_buffer_idx, tmp.get())); + + auto& keys = contents.children[cudf::dictionary_column_view::keys_column_index]; + NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher( + keys->type(), dispatch_to_arrow_device{}, std::move(*keys), stream, mr, tmp->dictionary)); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +struct ArrowDeviceArrayPrivateData { + ArrowArray parent; + cudaEvent_t sync_event; +}; + +void ArrowDeviceArrayRelease(ArrowArray* array) +{ + auto private_data = reinterpret_cast(array->private_data); + cudaEventDestroy(private_data->sync_event); + ArrowArrayRelease(&private_data->parent); + delete private_data; + array->release = nullptr; +} + +} // namespace +} // namespace detail + +unique_schema_t to_arrow_schema(cudf::table_view const& input, + cudf::host_span metadata) +{ + CUDF_EXPECTS((metadata.size() == static_cast(input.num_columns())), + "columns' metadata should be equal to the number of columns in table"); + + nanoarrow::UniqueSchema result; + ArrowSchemaInit(result.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(result.get(), input.num_columns())); + + for (int i = 0; i < input.num_columns(); ++i) { + auto child = result->children[i]; + auto col = input.column(i); + ArrowSchemaInit(child); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child, metadata[i].name.c_str())); + child->flags = col.has_nulls() ? 
ARROW_FLAG_NULLABLE : 0; + + if (col.type().id() == cudf::type_id::EMPTY) { + NANOARROW_THROW_NOT_OK(ArrowSchemaSetType(child, NANOARROW_TYPE_NA)); + continue; + } + + NANOARROW_THROW_NOT_OK( + cudf::type_dispatcher(col.type(), detail::dispatch_to_arrow_type{}, col, metadata[i], child)); + } + + unique_schema_t out(new ArrowSchema, [](ArrowSchema* schema) { + if (schema->release != nullptr) { ArrowSchemaRelease(schema); } + delete schema; + }); + result.move(out.get()); + return out; +} + +unique_device_array_t to_arrow_device(cudf::table&& table, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRUCT)); + + NANOARROW_THROW_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), table.num_columns())); + tmp->length = table.num_rows(); + tmp->null_count = 0; + + auto cols = table.release(); + for (size_t i = 0; i < cols.size(); ++i) { + auto child = tmp->children[i]; + auto col = cols[i].get(); + + if (col->type().id() == cudf::type_id::EMPTY) { + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(child, NANOARROW_TYPE_NA)); + child->length = col->size(); + child->null_count = col->size(); + continue; + } + + NANOARROW_THROW_NOT_OK(cudf::type_dispatcher( + col->type(), detail::dispatch_to_arrow_device{}, std::move(*col), stream, mr, child)); + } + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(tmp.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr)); + + auto private_data = std::make_unique(); + cudaEventCreate(&private_data->sync_event); + + auto status = cudaEventRecord(private_data->sync_event, stream); + if (status != cudaSuccess) { CUDF_FAIL("could not create event to sync on"); } + + ArrowArrayMove(tmp.get(), &private_data->parent); + unique_device_array_t result(new ArrowDeviceArray, [](ArrowDeviceArray* arr) { + if (arr->array.release != nullptr) { ArrowArrayRelease(&arr->array); } + delete arr; + }); + result->device_id = 
rmm::get_current_cuda_device().value(); + result->device_type = ARROW_DEVICE_CUDA; + result->sync_event = &private_data->sync_event; + result->array = private_data->parent; + result->array.private_data = private_data.release(); + result->array.release = &detail::ArrowDeviceArrayRelease; + return result; +} + +unique_device_array_t to_arrow_device(cudf::column&& col, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + nanoarrow::UniqueArray tmp; + if (col.type().id() == cudf::type_id::EMPTY) { + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_NA)); + tmp->length = col.size(); + tmp->null_count = col.size(); + } + + NANOARROW_THROW_NOT_OK(cudf::type_dispatcher( + col.type(), detail::dispatch_to_arrow_device{}, std::move(col), stream, mr, tmp.get())); + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(tmp.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr)); + + auto private_data = std::make_unique(); + cudaEventCreate(&private_data->sync_event); + + auto status = cudaEventRecord(private_data->sync_event, stream); + if (status != cudaSuccess) { CUDF_FAIL("could not create event to sync on"); } + + ArrowArrayMove(tmp.get(), &private_data->parent); + unique_device_array_t result(new ArrowDeviceArray, [](ArrowDeviceArray* arr) { + if (arr->array.release != nullptr) { ArrowArrayRelease(&arr->array); } + delete arr; + }); + result->device_id = rmm::get_current_cuda_device().value(); + result->device_type = ARROW_DEVICE_CUDA; + result->sync_event = &private_data->sync_event; + result->array = private_data->parent; + result->array.private_data = private_data.release(); + result->array.release = &detail::ArrowDeviceArrayRelease; + return result; +} + +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 9dbf278c71d..053fcc0989a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -24,7 +24,7 @@ rapids_test_init() # properties and linking to build the test 
function(ConfigureTest CMAKE_TEST_NAME) set(options) - set(one_value GPUS PERCENT STREAM_MODE) + set(one_value GPUS PERCENT STREAM_MODE EXTRA_LIB) set(multi_value) cmake_parse_arguments(_CUDF_TEST "${options}" "${one_value}" "${multi_value}" ${ARGN}) if(NOT DEFINED _CUDF_TEST_GPUS AND NOT DEFINED _CUDF_TEST_PERCENT) @@ -56,7 +56,7 @@ function(ConfigureTest CMAKE_TEST_NAME) target_link_libraries( ${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock_main GTest::gtest_main nvtx3-cpp - $ + $ "${_CUDF_TEST_EXTRA_LIB}" ) rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME}) rapids_test_add( @@ -267,7 +267,8 @@ ConfigureTest( # ################################################################################################## # * interop tests ------------------------------------------------------------------------- ConfigureTest( - INTEROP_TEST interop/to_arrow_test.cpp interop/from_arrow_test.cpp interop/dlpack_test.cpp + INTEROP_TEST interop/to_arrow_device_test.cpp interop/to_arrow_test.cpp + interop/from_arrow_test.cpp interop/dlpack_test.cpp EXTRA_LIB nanoarrow ) # ################################################################################################## diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp new file mode 100644 index 00000000000..e7ffa9e40f4 --- /dev/null +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +// no-op allocator/deallocator to set into ArrowArray buffers that we don't +// want to own their buffers. +static ArrowBufferAllocator noop_alloc = (struct ArrowBufferAllocator){ + .reallocate = [](ArrowBufferAllocator*, uint8_t* ptr, int64_t, int64_t) -> uint8_t* { + return ptr; + }, + .free = [](ArrowBufferAllocator*, uint8_t*, int64_t) {}, + .private_data = nullptr, +}; + +// populate the ArrowArray by copying host data buffers for fixed width types other +// than boolean. +template +std::enable_if_t() and !std::is_same_v, void> get_nanoarrow_array( + ArrowArray* arr, std::vector const& data, std::vector const& mask = {}) +{ + arr->length = data.size(); + NANOARROW_THROW_NOT_OK( + ArrowBufferAppend(ArrowArrayBuffer(arr, 1), data.data(), sizeof(T) * data.size())); + if (!mask.empty()) { + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), mask.size())); + ArrowBitmapAppendInt8Unsafe( + ArrowArrayValidityBitmap(arr), reinterpret_cast(mask.data()), mask.size()); + arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, data.size()); + } else { + arr->null_count = 0; + } + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK, + "failed to construct array"); +} + +// populate an ArrowArray with pointers to the raw device buffers of a cudf::column_view +// and use the no-op alloc so that the ArrowArray doesn't presume ownership of the data +template +std::enable_if_t() and !std::is_same_v, void> populate_from_col( + ArrowArray* arr, cudf::column_view view) +{ + arr->length = view.size(); + arr->null_count = view.null_count(); + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc); + ArrowArrayValidityBitmap(arr)->buffer.data = + const_cast(reinterpret_cast(view.null_mask())); + 
ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc); + ArrowArrayBuffer(arr, 1)->data = const_cast(view.data()); +} + +// populate an ArrowArray with boolean data by generating the appropriate +// bitmaps to copy the data. +template +std::enable_if_t, void> get_nanoarrow_array( + ArrowArray* arr, std::vector const& data, std::vector const& mask = {}) +{ + ArrowBitmap bool_data; + ArrowBitmapInit(&bool_data); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&bool_data, data.size())); + std::for_each(data.begin(), data.end(), [&](const auto&& elem) { + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&bool_data, (elem) ? 1 : 0, 1)); + }); + NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(arr, 1, &bool_data.buffer)); + + if (!mask.empty()) { + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), mask.size())); + std::for_each(mask.begin(), mask.end(), [&](const auto&& elem) { + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(arr), (elem) ? 1 : 0, 1)); + }); + arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, data.size()); + } else { + arr->null_count = 0; + } + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK, + "failed to construct boolean array"); +} + +// populate an ArrowArray from a boolean cudf column. Since Arrow and cudf +// still represent boolean arrays differently, we have to use bools_to_mask +// and give the ArrowArray object ownership of the device data. 
+template +std::enable_if_t, void> populate_from_col(ArrowArray* arr, + cudf::column_view view) +{ + arr->length = view.size(); + arr->null_count = view.null_count(); + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc); + ArrowArrayValidityBitmap(arr)->buffer.data = + const_cast(reinterpret_cast(view.null_mask())); + + auto bitmask = cudf::bools_to_mask(view); + auto ptr = reinterpret_cast(bitmask.first->data()); + ArrowBufferSetAllocator( + ArrowArrayBuffer(arr, 1), + ArrowBufferDeallocator( + [](ArrowBufferAllocator* alloc, uint8_t*, int64_t) { + auto buf = reinterpret_cast*>(alloc->private_data); + delete buf; + }, + new std::unique_ptr(std::move(bitmask.first)))); + ArrowArrayBuffer(arr, 1)->data = ptr; +} + +// populate an ArrowArray by copying the string data and constructing the offsets +// buffer. +template +std::enable_if_t, void> get_nanoarrow_array( + ArrowArray* arr, std::vector const& data, std::vector const& mask = {}) +{ + NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(arr)); + for (auto& str : data) { + NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(arr, ArrowCharView(str.c_str()))); + } + + if (!mask.empty()) { + ArrowBitmapReset(ArrowArrayValidityBitmap(arr)); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), mask.size())); + ArrowBitmapAppendInt8Unsafe( + ArrowArrayValidityBitmap(arr), reinterpret_cast(mask.data()), mask.size()); + arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, data.size()); + } else { + arr->null_count = 0; + } + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK, + "failed to construct string array"); +} + +// populate an ArrowArray with the string data buffers of a cudf column_view +// using no-op allocator so the ArrowArray knows it doesn't have ownership +// of the device buffers. 
+template +std::enable_if_t, void> populate_from_col( + ArrowArray* arr, cudf::column_view view) +{ + arr->length = view.size(); + arr->null_count = view.null_count(); + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc); + ArrowArrayValidityBitmap(arr)->buffer.data = + const_cast(reinterpret_cast(view.null_mask())); + + cudf::strings_column_view sview{view}; + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc); + ArrowArrayBuffer(arr, 1)->data = const_cast(sview.offsets().data()); + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 2), noop_alloc); + ArrowArrayBuffer(arr, 2)->data = const_cast(view.data()); +} + +// populate a dictionary ArrowArray by delegating the copying of the indices +// and key arrays +template +void get_nanoarrow_dict_array(ArrowArray* arr, + std::vector const& keys, + std::vector const& ind, + std::vector const& validity = {}) +{ + get_nanoarrow_array(arr->dictionary, keys); + get_nanoarrow_array(arr, ind, validity); +} + +// populate a list ArrowArray by copying the offsets and data buffers +template +void get_nanoarrow_list_array(ArrowArray* arr, + std::vector data, + std::vector offsets, + std::vector data_validity = {}, + std::vector list_validity = {}) +{ + get_nanoarrow_array(arr->children[0], data, data_validity); + + arr->length = offsets.size() - 1; + NANOARROW_THROW_NOT_OK( + ArrowBufferAppend(ArrowArrayBuffer(arr, 1), offsets.data(), sizeof(int32_t) * offsets.size())); + if (!list_validity.empty()) { + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), list_validity.size())); + ArrowBitmapAppendInt8Unsafe(ArrowArrayValidityBitmap(arr), + reinterpret_cast(list_validity.data()), + arr->length); + arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, arr->length); + } else { + arr->null_count = 0; + } + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK, + "failed to construct list array"); +} + +// populate an ArrowArray list array from 
device buffers using a no-op +// allocator so that the ArrowArray doesn't have ownership of the buffers +void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view) +{ + arr->length = view.size(); + arr->null_count = view.null_count(); + + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc); + ArrowArrayValidityBitmap(arr)->buffer.data = + const_cast(reinterpret_cast(view.null_mask())); + + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc); + ArrowArrayBuffer(arr, 1)->data = const_cast(view.offsets().data()); +} diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp new file mode 100644 index 00000000000..243aa4e81af --- /dev/null +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -0,0 +1,739 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nanoarrow_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using vector_of_columns = std::vector>; + +std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> +get_nanoarrow_tables(cudf::size_type length) +{ + std::vector int64_data(length); + std::vector bool_data(length); + std::vector string_data(length); + std::vector validity(length); + std::vector bool_validity(length); + std::vector bool_data_validity; + cudf::size_type length_of_individual_list = 3; + cudf::size_type length_of_list = length_of_individual_list * length; + std::vector list_int64_data(length_of_list); + std::vector list_int64_data_validity(length_of_list); + std::vector list_offsets(length + 1); + + std::vector> columns; + + columns.emplace_back(cudf::test::fixed_width_column_wrapper( + int64_data.begin(), int64_data.end(), validity.begin()) + .release()); + columns.emplace_back( + cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) + .release()); + auto col4 = cudf::test::fixed_width_column_wrapper( + int64_data.begin(), int64_data.end(), validity.begin()); + auto dict_col = cudf::dictionary::encode(col4); + columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::test::fixed_width_column_wrapper( + bool_data.begin(), bool_data.end(), bool_validity.begin()) + .release()); + auto list_child_column = cudf::test::fixed_width_column_wrapper( + list_int64_data.begin(), list_int64_data.end(), list_int64_data_validity.begin()); + auto list_offsets_column = + cudf::test::fixed_width_column_wrapper(list_offsets.begin(), list_offsets.end()); + auto [list_mask, list_nulls] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( + bool_data_validity.begin(), bool_data_validity.end())); + columns.emplace_back(cudf::make_lists_column(length, + 
list_offsets_column.release(), + list_child_column.release(), + list_nulls, + std::move(*list_mask))); + auto int_column = cudf::test::fixed_width_column_wrapper( + int64_data.begin(), int64_data.end(), validity.begin()) + .release(); + auto str_column = + cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) + .release(); + vector_of_columns cols; + cols.push_back(move(int_column)); + cols.push_back(move(str_column)); + auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( + bool_data_validity.begin(), bool_data_validity.end())); + columns.emplace_back( + cudf::make_structs_column(length, std::move(cols), null_count, std::move(*null_mask))); + + nanoarrow::UniqueSchema schema; + ArrowSchemaInit(schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(schema.get(), 6)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[0], "a")); + if (columns[0]->null_count() > 0) { + schema->children[0]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[0]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[1], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[1], "b")); + if (columns[1]->null_count() > 0) { + schema->children[1]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[1]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[2], NANOARROW_TYPE_UINT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(schema->children[2])); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[2]->dictionary, NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[2], "c")); + if (columns[2]->null_count() > 0) { + schema->children[2]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[2]->flags = 0; + } + + 
NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[3], NANOARROW_TYPE_BOOL)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[3], "d")); + if (columns[3]->null_count() > 0) { + schema->children[3]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[3]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[4], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[4]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[4]->children[0], "element")); + if (columns[4]->child(1).null_count() > 0) { + schema->children[4]->children[0]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[4]->children[0]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[4], "e")); + if (columns[4]->has_nulls()) { + schema->children[4]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[4]->flags = 0; + } + + ArrowSchemaInit(schema->children[5]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(schema->children[5], 2)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[5]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5]->children[0], "integral")); + if (columns[5]->child(0).has_nulls()) { + schema->children[5]->children[0]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[5]->children[0]->flags = 0; + } + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[5]->children[1], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5]->children[1], "string")); + if (columns[5]->child(1).has_nulls()) { + schema->children[5]->children[1]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[5]->children[1]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5], "f")); + if (columns[5]->has_nulls()) { + schema->children[5]->flags |= 
ARROW_FLAG_NULLABLE; + } else { + schema->children[5]->flags = 0; + } + + nanoarrow::UniqueArray arrow; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(arrow.get(), schema.get(), nullptr)); + + get_nanoarrow_array(arrow->children[0], int64_data, validity); + get_nanoarrow_array(arrow->children[1], string_data, validity); + cudf::dictionary_column_view view(dict_col->view()); + auto keys = cudf::test::to_host(view.keys()).first; + auto indices = cudf::test::to_host(view.indices()).first; + get_nanoarrow_dict_array(arrow->children[2], + std::vector(keys.begin(), keys.end()), + std::vector(indices.begin(), indices.end()), + validity); + get_nanoarrow_array(arrow->children[3], bool_data, bool_validity); + get_nanoarrow_list_array(arrow->children[4], + list_int64_data, + list_offsets, + list_int64_data_validity, + bool_data_validity); + + get_nanoarrow_array(arrow->children[5]->children[0], int64_data, validity); + get_nanoarrow_array(arrow->children[5]->children[1], string_data, validity); + arrow->children[5]->length = length; + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arrow->children[5]), length)); + std::for_each(bool_data_validity.begin(), bool_data_validity.end(), [&](auto&& elem) { + NANOARROW_THROW_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(arrow->children[5]), (elem) ? 
1 : 0, 1)); + }); + arrow->children[5]->null_count = + ArrowBitCountSet(ArrowArrayValidityBitmap(arrow->children[5])->buffer.data, 0, length); + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arrow.get(), nullptr) == NANOARROW_OK, + "failed to build example Arrays"); + + return std::make_tuple( + std::make_unique(std::move(columns)), std::move(schema), std::move(arrow)); +} + +struct BaseArrowFixture : public cudf::test::BaseFixture { + void compare_schemas(const ArrowSchema* expected, const ArrowSchema* actual) + { + EXPECT_STREQ(expected->format, actual->format); + EXPECT_STREQ(expected->name, actual->name); + EXPECT_STREQ(expected->metadata, actual->metadata); + EXPECT_EQ(expected->flags, actual->flags); + EXPECT_EQ(expected->n_children, actual->n_children); + + if (expected->n_children == 0) { + EXPECT_EQ(nullptr, actual->children); + } else { + for (int i = 0; i < expected->n_children; ++i) { + SCOPED_TRACE(expected->children[i]->name); + compare_schemas(expected->children[i], actual->children[i]); + } + } + + if (expected->dictionary != nullptr) { + EXPECT_NE(nullptr, actual->dictionary); + SCOPED_TRACE("dictionary"); + compare_schemas(expected->dictionary, actual->dictionary); + } else { + EXPECT_EQ(nullptr, actual->dictionary); + } + } + + void compare_device_buffers(const size_t nbytes, + const int buffer_idx, + const ArrowArray* expected, + const ArrowArray* actual) + { + std::vector actual_bytes; + std::vector expected_bytes; + expected_bytes.resize(nbytes); + actual_bytes.resize(nbytes); + + // synchronous copies so we don't have to worry about async weirdness + cudaMemcpy( + expected_bytes.data(), expected->buffers[buffer_idx], nbytes, cudaMemcpyDeviceToHost); + cudaMemcpy(actual_bytes.data(), actual->buffers[buffer_idx], nbytes, cudaMemcpyDeviceToHost); + + ASSERT_EQ(expected_bytes, actual_bytes); + } + + void compare_arrays(const ArrowSchema* schema, + const ArrowArray* expected, + const ArrowArray* actual) + { + ArrowSchemaView schema_view; + 
ArrowSchemaViewInit(&schema_view, schema, nullptr); + + EXPECT_EQ(expected->length, actual->length); + EXPECT_EQ(expected->null_count, actual->null_count); + EXPECT_EQ(expected->offset, actual->offset); + EXPECT_EQ(expected->n_buffers, actual->n_buffers); + EXPECT_EQ(expected->n_children, actual->n_children); + + if (expected->length > 0) { + EXPECT_EQ(expected->buffers[0], actual->buffers[0]); + if (schema_view.type == NANOARROW_TYPE_BOOL) { + const size_t nbytes = (expected->length + 7) >> 3; + compare_device_buffers(nbytes, 1, expected, actual); + } else if (schema_view.type == NANOARROW_TYPE_DECIMAL128) { + const size_t nbytes = (expected->length * sizeof(__int128_t)); + compare_device_buffers(nbytes, 1, expected, actual); + } else { + for (int i = 1; i < expected->n_buffers; ++i) { + EXPECT_EQ(expected->buffers[i], actual->buffers[i]); + } + } + } + + if (expected->n_children == 0) { + EXPECT_EQ(nullptr, actual->children); + } else { + for (int i = 0; i < expected->n_children; ++i) { + SCOPED_TRACE(schema->children[i]->name); + compare_arrays(schema->children[i], expected->children[i], actual->children[i]); + } + } + + if (expected->dictionary != nullptr) { + EXPECT_NE(nullptr, actual->dictionary); + SCOPED_TRACE("dictionary"); + compare_arrays(schema->dictionary, expected->dictionary, actual->dictionary); + } else { + EXPECT_EQ(nullptr, actual->dictionary); + } + } +}; + +struct ToArrowDeviceTest : public BaseArrowFixture {}; + +template +struct ToArrowDeviceTestDurationsTest : public BaseArrowFixture {}; + +TYPED_TEST_SUITE(ToArrowDeviceTestDurationsTest, cudf::test::DurationTypes); + +TEST_F(ToArrowDeviceTest, EmptyTable) +{ + const auto [table, schema, arr] = get_nanoarrow_tables(0); + + auto struct_meta = cudf::column_metadata{"f"}; + struct_meta.children_meta = {{"integral"}, {"string"}}; + + cudf::dictionary_column_view dview{table->view().column(2)}; + + std::vector meta{{"a"}, {"b"}, {"c"}, {"d"}, {"e"}, struct_meta}; + auto got_arrow_schema = 
cudf::to_arrow_schema(table->view(), meta); + + compare_schemas(schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + auto got_arrow_device = cudf::to_arrow_device(std::move(*table)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_device->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_device->device_type); + + compare_arrays(schema.get(), arr.get(), &got_arrow_device->array); + ArrowArrayRelease(&got_arrow_device->array); +} + +TEST_F(ToArrowDeviceTest, DateTimeTable) +{ + auto data = {1, 2, 3, 4, 5, 6}; + auto col = + cudf::test::fixed_width_column_wrapper(data); + std::vector> cols; + cols.emplace_back(col.release()); + cudf::table input(std::move(cols)); + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + ArrowSchemaInit(expected_schema->children[0]); + ArrowSchemaSetTypeDateTime( + expected_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = 0; + + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + auto data_ptr = input.get_column(0).view().data(); + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + EXPECT_EQ(data.size(), got_arrow_array->array.length); + EXPECT_EQ(0, got_arrow_array->array.null_count); + EXPECT_EQ(0, got_arrow_array->array.offset); + EXPECT_EQ(1, got_arrow_array->array.n_children); + EXPECT_EQ(nullptr, got_arrow_array->array.buffers[0]); + + EXPECT_EQ(data.size(), got_arrow_array->array.children[0]->length); + EXPECT_EQ(0, 
got_arrow_array->array.children[0]->null_count); + EXPECT_EQ(0, got_arrow_array->array.children[0]->offset); + EXPECT_EQ(nullptr, got_arrow_array->array.children[0]->buffers[0]); + EXPECT_EQ(data_ptr, got_arrow_array->array.children[0]->buffers[1]); + + ArrowArrayRelease(&got_arrow_array->array); +} + +TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable) +{ + using T = TypeParam; + + if (cudf::type_to_id() == cudf::type_id::DURATION_DAYS) { return; } + + auto data = {T{1}, T{2}, T{3}, T{4}, T{5}, T{6}}; + auto col = cudf::test::fixed_width_column_wrapper(data); + + std::vector> cols; + cols.emplace_back(col.release()); + cudf::table input(std::move(cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + + ArrowSchemaInit(expected_schema->children[0]); + const ArrowTimeUnit arrow_unit = [&] { + switch (cudf::type_to_id()) { + case cudf::type_id::DURATION_SECONDS: return NANOARROW_TIME_UNIT_SECOND; + case cudf::type_id::DURATION_MILLISECONDS: return NANOARROW_TIME_UNIT_MILLI; + case cudf::type_id::DURATION_MICROSECONDS: return NANOARROW_TIME_UNIT_MICRO; + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TIME_UNIT_NANO; + default: CUDF_FAIL("Unsupported duration unit in arrow"); + } + }(); + ArrowSchemaSetTypeDateTime( + expected_schema->children[0], NANOARROW_TYPE_DURATION, arrow_unit, nullptr); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = 0; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + BaseArrowFixture::compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + auto data_ptr = input.get_column(0).view().data(); + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, 
got_arrow_array->device_type); + + EXPECT_EQ(data.size(), got_arrow_array->array.length); + EXPECT_EQ(0, got_arrow_array->array.null_count); + EXPECT_EQ(0, got_arrow_array->array.offset); + EXPECT_EQ(1, got_arrow_array->array.n_children); + EXPECT_EQ(nullptr, got_arrow_array->array.buffers[0]); + + EXPECT_EQ(data.size(), got_arrow_array->array.children[0]->length); + EXPECT_EQ(0, got_arrow_array->array.children[0]->null_count); + EXPECT_EQ(0, got_arrow_array->array.children[0]->offset); + EXPECT_EQ(nullptr, got_arrow_array->array.children[0]->buffers[0]); + EXPECT_EQ(data_ptr, got_arrow_array->array.children[0]->buffers[1]); + + ArrowArrayRelease(&got_arrow_array->array); +} + +TEST_F(ToArrowDeviceTest, NestedList) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; }); + auto col = cudf::test::lists_column_wrapper( + {{{{{1, 2}, valids}, {{3, 4}, valids}, {5}}, {{6}, {{7, 8, 9}, valids}}}, valids}); + + std::vector> cols; + cols.emplace_back(col.release()); + cudf::table input(std::move(cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + + ArrowSchemaInitFromType(expected_schema->children[0], NANOARROW_TYPE_LIST); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = ARROW_FLAG_NULLABLE; + + ArrowSchemaInitFromType(expected_schema->children[0]->children[0], NANOARROW_TYPE_LIST); + ArrowSchemaSetName(expected_schema->children[0]->children[0], "element"); + expected_schema->children[0]->children[0]->flags = 0; + + ArrowSchemaInitFromType(expected_schema->children[0]->children[0]->children[0], + NANOARROW_TYPE_INT64); + ArrowSchemaSetName(expected_schema->children[0]->children[0]->children[0], "element"); + expected_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + 
compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + nanoarrow::UniqueArray expected_array; + EXPECT_EQ(NANOARROW_OK, + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + auto top_list = expected_array->children[0]; + cudf::lists_column_view lview{input.get_column(0).view()}; + populate_list_from_col(top_list, lview); + cudf::lists_column_view nested_view{lview.child()}; + populate_list_from_col(top_list->children[0], nested_view); + populate_from_col(top_list->children[0]->children[0], nested_view.child()); + + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr); + + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + ArrowArrayRelease(&got_arrow_array->array); +} + +TEST_F(ToArrowDeviceTest, StructColumn) +{ + // Create cudf table + auto nested_type_field_names = + std::vector>{{"string", "integral", "bool", "nested_list", "struct"}}; + auto str_col = + cudf::test::strings_column_wrapper{ + "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Ɯberwald"} + .release(); + auto str_col2 = + cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release(); + int num_rows{str_col->size()}; + auto int_col = cudf::test::fixed_width_column_wrapper{{48, 27, 25}}.release(); + auto int_col2 = + cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); + auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); + auto list_col = + cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) + .release(); + vector_of_columns cols2; + cols2.push_back(std::move(str_col2)); + 
cols2.push_back(std::move(int_col2)); + auto [null_mask, null_count] = + cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper{{true, true, false}}); + auto sub_struct_col = + cudf::make_structs_column(num_rows, std::move(cols2), null_count, std::move(*null_mask)); + vector_of_columns cols; + cols.push_back(std::move(str_col)); + cols.push_back(std::move(int_col)); + cols.push_back(std::move(bool_col)); + cols.push_back(std::move(list_col)); + cols.push_back(std::move(sub_struct_col)); + + auto struct_col = cudf::make_structs_column(num_rows, std::move(cols), 0, {}); + std::vector> table_cols; + table_cols.emplace_back(struct_col.release()); + cudf::table input(std::move(table_cols)); + + // Create name metadata + auto sub_metadata = cudf::column_metadata{"struct"}; + sub_metadata.children_meta = {{"string2"}, {"integral2"}}; + auto metadata = cudf::column_metadata{"a"}; + metadata.children_meta = {{"string"}, {"integral"}, {"bool"}, {"nested_list"}, sub_metadata}; + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + + ArrowSchemaInit(expected_schema->children[0]); + ArrowSchemaSetTypeStruct(expected_schema->children[0], 5); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = 0; + + auto child = expected_schema->children[0]; + ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING); + ArrowSchemaSetName(child->children[0], "string"); + child->children[0]->flags = 0; + + ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32); + ArrowSchemaSetName(child->children[1], "integral"); + child->children[1]->flags = 0; + + ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL); + ArrowSchemaSetName(child->children[2], "bool"); + child->children[2]->flags = 0; + + ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST); + ArrowSchemaSetName(child->children[3], "nested_list"); + 
child->children[3]->flags = 0; + ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST); + ArrowSchemaSetName(child->children[3]->children[0], "element"); + child->children[3]->children[0]->flags = 0; + ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64); + ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element"); + child->children[3]->children[0]->children[0]->flags = 0; + + ArrowSchemaInit(child->children[4]); + ArrowSchemaSetTypeStruct(child->children[4], 2); + ArrowSchemaSetName(child->children[4], "struct"); + + ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING); + ArrowSchemaSetName(child->children[4]->children[0], "string2"); + ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32); + ArrowSchemaSetName(child->children[4]->children[1], "integral2"); + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{metadata}); + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + nanoarrow::UniqueArray expected_array; + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr); + + expected_array->length = input.num_rows(); + + auto array_a = expected_array->children[0]; + auto view_a = input.view().column(0); + array_a->length = view_a.size(); + array_a->null_count = view_a.null_count(); + + ArrowBufferSetAllocator(ArrowArrayBuffer(array_a, 0), noop_alloc); + ArrowArrayValidityBitmap(array_a)->buffer.data = + const_cast(reinterpret_cast(view_a.null_mask())); + + populate_from_col(array_a->children[0], view_a.child(0)); + populate_from_col(array_a->children[1], view_a.child(1)); + populate_from_col(array_a->children[2], view_a.child(2)); + populate_list_from_col(array_a->children[3], cudf::lists_column_view{view_a.child(3)}); + populate_list_from_col(array_a->children[3]->children[0], + 
cudf::lists_column_view{view_a.child(3).child(1)}); + populate_from_col(array_a->children[3]->children[0]->children[0], + view_a.child(3).child(1).child(1)); + + auto array_struct = array_a->children[4]; + auto view_struct = view_a.child(4); + array_struct->length = view_struct.size(); + array_struct->null_count = view_struct.null_count(); + + ArrowBufferSetAllocator(ArrowArrayBuffer(array_struct, 0), noop_alloc); + ArrowArrayValidityBitmap(array_struct)->buffer.data = + const_cast(reinterpret_cast(view_struct.null_mask())); + + populate_from_col(array_struct->children[0], view_struct.child(0)); + populate_from_col(array_struct->children[1], view_struct.child(1)); + + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr); + + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + ArrowArrayRelease(&got_arrow_array->array); +} + +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TEST_F(ToArrowDeviceTest, FixedPoint64Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const expect_data = std::vector{-1, -1, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + auto col = fp_wrapper({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + std::vector> table_cols; + table_cols.emplace_back(col.release()); + auto input = cudf::table(std::move(table_cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + ArrowSchemaInit(expected_schema->children[0]); + ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale); + ArrowSchemaSetName(expected_schema->children[0], "a"); + 
expected_schema->children[0]->flags = 0; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + auto result_dev_data = std::make_unique>( + expect_data.size(), cudf::get_default_stream()); + cudaMemcpy(result_dev_data->data(), + expect_data.data(), + sizeof(int64_t) * expect_data.size(), + cudaMemcpyHostToDevice); + + cudf::get_default_stream().synchronize(); + nanoarrow::UniqueArray expected_array; + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr); + expected_array->length = input.num_rows(); + + expected_array->children[0]->length = input.num_rows(); + ArrowBufferSetAllocator(ArrowArrayBuffer(expected_array->children[0], 0), noop_alloc); + ArrowArrayValidityBitmap(expected_array->children[0])->buffer.data = + const_cast(reinterpret_cast(input.view().column(0).null_mask())); + + auto data_ptr = reinterpret_cast(result_dev_data->data()); + ArrowBufferSetAllocator( + ArrowArrayBuffer(expected_array->children[0], 1), + ArrowBufferDeallocator( + [](ArrowBufferAllocator* alloc, uint8_t*, int64_t) { + auto buf = + reinterpret_cast>*>(alloc->private_data); + delete buf; + }, + new std::unique_ptr>(std::move(result_dev_data)))); + ArrowArrayBuffer(expected_array->children[0], 1)->data = data_ptr; + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr); + + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + ArrowArrayRelease(&got_arrow_array->array); + } +} + +TEST_F(ToArrowDeviceTest, FixedPoint128Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const expect_data = 
std::vector<__int128_t>{-1, 2, 3, 4, 5, 6}; + auto col = fp_wrapper<__int128_t>({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + std::vector> table_cols; + table_cols.emplace_back(col.release()); + auto input = cudf::table(std::move(table_cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + ArrowSchemaInit(expected_schema->children[0]); + ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision<__int128_t>(), + -scale); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = 0; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + nanoarrow::UniqueArray expected_array; + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr); + expected_array->length = input.num_rows(); + + populate_from_col<__int128_t>(expected_array->children[0], input.view().column(0)); + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr); + + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + ArrowArrayRelease(&got_arrow_array->array); + } +} diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 7afc8fe19bf..b891ff99d47 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -306,6 +306,7 @@ def clean_all_xml_files(path): intersphinx_mapping = { "cupy": ("https://docs.cupy.dev/en/stable/", None), "dlpack": ("https://dmlc.github.io/dlpack/latest/", None), + "nanoarrow": ("https://arrow.apache.org/nanoarrow/latest", None), 
"numpy": ("https://numpy.org/doc/stable", None), "pandas": ("https://pandas.pydata.org/docs/", None), "pyarrow": ("https://arrow.apache.org/docs/", None), From aab8a76b532b46713b9784302ffd202586ecb5cc Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 2 Apr 2024 02:14:01 +0200 Subject: [PATCH 018/272] Fixes potential race in JSON parser when parsing JSON lines format and when recovering from invalid lines (#15419) PR adds a missing synchronization before the FST destructor of the FST used for cleaning excess characters following the first valid record on a JSON line. The problem is that the FST's destructor could otherwise free memory that is yet to be used by the still running FST instance. Closes https://github.com/rapidsai/cudf/issues/15409 Authors: - Elias Stehle (https://github.com/elstehle) Approvers: - Alessandro Bellina (https://github.com/abellina) - Shruti Shivakumar (https://github.com/shrshi) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/15419 --- cpp/src/io/json/nested_json_gpu.cu | 3 + cpp/tests/io/json_test.cpp | 107 +++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index a6a57c36b08..4ddbe735963 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1583,6 +1583,9 @@ std::pair, rmm::device_uvector> ge thrust::make_discard_iterator(), fix_stack_of_excess_chars::start_state, stream); + + // Make sure memory of the FST's lookup tables isn't freed before the FST completes + stream.synchronize(); } constexpr auto max_translation_table_size = diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 0b70e5e3f93..bae71d3c2a8 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include 
#include #include @@ -35,12 +36,15 @@ #include #include +#include + #include #include #include #include +#include #include #define wrapper cudf::test::fixed_width_column_wrapper @@ -2050,6 +2054,109 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars) float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}, c_validity.cbegin()}); } +// Sanity test that checks whether there's a race on the FST destructor +TEST_F(JsonReaderTest, JSONLinesRecoveringSync) +{ + // Set up host pinned memory pool to avoid implicit synchronizations to test for any potential + // races due to missing host-device synchronizations + using host_pooled_mr = rmm::mr::pool_memory_resource; + host_pooled_mr mr{std::make_shared().get(), + size_t{128} * 1024 * 1024}; + + // Set new resource + auto last_mr = cudf::io::set_host_memory_resource(mr); + + /** + * @brief Spark has the specific need to ignore extra characters that come after the first record + * on a JSON line + */ + std::string data = + // 0 -> a: -2 (valid) + R"({"a":-2}{})" + "\n" + // 1 -> (invalid) + R"({"b":{}should_be_invalid})" + "\n" + // 2 -> b (valid) + R"({"b":{"a":3} })" + "\n" + // 3 -> c: (valid) + R"({"c":1.2 } )" + "\n" + "\n" + // 4 -> (valid) + R"({"a":4} 123)" + "\n" + // 5 -> (valid) + R"({"a":5}//Comment after record)" + "\n" + // 6 -> (valid) + R"({"a":6} //Comment after whitespace)" + "\n" + // 7 -> (invalid) + R"({"a":5 //Invalid Comment within record})"; + + // Create input of a certain size to potentially reveal a missing host/device sync + std::size_t const target_size = 40000000; + auto const repetitions_log2 = + static_cast(std::ceil(std::log2(target_size / data.size()))); + auto const repetitions = 1ULL << repetitions_log2; + + for (std::size_t i = 0; i < repetitions_log2; ++i) { + data = data + "\n" + data; + } + + auto filepath = temp_env->get_temp_dir() + "RecoveringLinesExcessChars.json"; + { + std::ofstream outfile(filepath, std::ofstream::out); + outfile << data; + } + + 
cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 3); + EXPECT_EQ(result.tbl->num_rows(), 8 * repetitions); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::FLOAT64); + + std::vector a_validity{true, false, false, false, true, true, true, false}; + std::vector b_validity{false, false, true, false, false, false, false, false}; + std::vector c_validity{false, false, false, true, false, false, false, false}; + + std::vector a_data{-2, 0, 0, 0, 4, 5, 6, 0}; + std::vector b_a_data{0, 0, 3, 0, 0, 0, 0, 0}; + std::vector c_data{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}; + + for (std::size_t i = 0; i < repetitions_log2; ++i) { + a_validity.insert(a_validity.end(), a_validity.cbegin(), a_validity.cend()); + b_validity.insert(b_validity.end(), b_validity.cbegin(), b_validity.cend()); + c_validity.insert(c_validity.end(), c_validity.cbegin(), c_validity.cend()); + a_data.insert(a_data.end(), a_data.cbegin(), a_data.cend()); + b_a_data.insert(b_a_data.end(), b_a_data.cbegin(), b_a_data.cend()); + c_data.insert(c_data.end(), c_data.cbegin(), c_data.cend()); + } + + // Child column b->a + auto b_a_col = int64_wrapper(b_a_data.cbegin(), b_a_data.cend()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(0), int64_wrapper{a_data.cbegin(), a_data.cend(), a_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), cudf::test::structs_column_wrapper({b_a_col}, b_validity.cbegin())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(2), + float64_wrapper{c_data.cbegin(), c_data.cend(), c_validity.cbegin()}); + + // 
Restore original memory source + cudf::io::set_host_memory_resource(last_mr); +} + TEST_F(JsonReaderTest, MixedTypes) { using LCWS = cudf::test::lists_column_wrapper; From 08ac1eb7832fe99f44b25f192d9931d393a96983 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 2 Apr 2024 08:27:49 -1000 Subject: [PATCH 019/272] Bump ruff and codespell pre-commit checks (#15407) xref https://github.com/rapidsai/cudf/pull/15345#discussion_r1532379047 Before pursuing migrating isort to ruff, bumping ruff to the latest version Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15407 --- .pre-commit-config.yaml | 4 ++-- cpp/include/cudf/io/detail/parquet.hpp | 4 ++-- cpp/src/copying/contiguous_split.cu | 2 +- cpp/src/io/orc/aggregate_orc_metadata.cpp | 2 +- pyproject.toml | 8 +++++--- python/cudf/benchmarks/common/config.py | 3 ++- python/cudf/cudf/_fuzz_testing/utils.py | 6 +++--- python/cudf/cudf/core/buffer/buffer.py | 2 +- python/cudf/cudf/core/buffer/spillable_buffer.py | 2 +- python/cudf/cudf/core/column/__init__.py | 1 - python/cudf/cudf/core/column/methods.py | 12 ++++-------- python/cudf/cudf/core/column/string.py | 6 ++---- python/cudf/cudf/io/parquet.py | 6 +++--- .../cudf/pandas/scripts/analyze-test-failures.py | 3 ++- .../cudf/pandas/scripts/summarize-test-results.py | 3 ++- python/cudf/cudf/tests/test_index.py | 1 + python/cudf/cudf/tests/test_monotonic.py | 1 + python/cudf/cudf/tests/test_multiindex.py | 1 + python/cudf/cudf/utils/docutils.py | 1 + python/cudf/cudf/utils/dtypes.py | 2 +- 20 files changed, 36 insertions(+), 34 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 06fdcb9f761..3e99cf3fa9a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -113,7 +113,7 @@ repos: 
pass_filenames: false verbose: true - repo: https://github.com/codespell-project/codespell - rev: v2.2.2 + rev: v2.2.6 hooks: - id: codespell additional_dependencies: [tomli] @@ -129,7 +129,7 @@ repos: - id: rapids-dependency-file-generator args: ["--clean"] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.13 + rev: v0.3.4 hooks: - id: ruff files: python/.*$ diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 0b8ee9676de..df870f6f1e4 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -110,7 +110,7 @@ class chunked_reader : private reader { * The chunk_read_limit parameter controls the size of the output chunks produces. If the user * specifies 100 MB of data, the reader will attempt to return chunks containing tables that have * a total bytes size (over all columns) of 100 MB or less. This is a soft limit and the code - * will not fail if it cannot satisfy the limit. It will make a best-effort atttempt only. + * will not fail if it cannot satisfy the limit. It will make a best-effort attempt only. * * The pass_read_limit parameter controls how much temporary memory is used in the process of * decoding the file. The primary contributor to this memory usage is the uncompressed size of diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 23224d3225d..23bcd344a32 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1139,7 +1139,7 @@ struct packed_src_and_dst_pointers { /** * @brief Create an instance of `packed_src_and_dst_pointers` populating destination - * partitition buffers (if any) from `out_buffers`. 
In the chunked_pack case + * partition buffers (if any) from `out_buffers`. In the chunked_pack case * `out_buffers` is empty, and the destination pointer is provided separately * to the `copy_partitions` kernel. * diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index f5f540bc3a4..d54524f0f0d 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -194,7 +194,7 @@ aggregate_orc_metadata::select_stripes( } else { int64_t count = 0; int64_t stripe_skip_rows = 0; - // Iterate all source files, each source file has corelating metadata + // Iterate all source files, each source file has correlating metadata for (size_t src_file_idx = 0; src_file_idx < per_file_metadata.size() && count < rows_to_skip + rows_to_read; ++src_file_idx) { diff --git a/pyproject.toml b/pyproject.toml index 28eac66c1d6..797b5374cb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,11 +19,14 @@ exclude = [ skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,./cpp/tests,./python/cudf/cudf/tests,./java/src/test,./cpp/include/cudf_test/cxxopts.hpp" # ignore short words, and typename parameters like OffsetT ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" -ignore-words-list = "inout,unparseable,falsy" +ignore-words-list = "inout,unparseable,falsy,couldn,Couldn" builtin = "clear" quiet-level = 3 [tool.ruff] +line-length = 79 + +[tool.ruff.lint] select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418"] ignore = [ # whitespace before : @@ -36,9 +39,8 @@ exclude = [ # TODO: Remove this in a follow-up where we fix __all__. 
"__init__.py", ] -line-length = 79 -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] # Lots of pytest implicitly injected attributes in conftest-patch.py "python/cudf/cudf/pandas/scripts/conftest-patch.py" = ["F821"] "python/cudf/cudf/pandas/scripts/*" = ["D"] diff --git a/python/cudf/benchmarks/common/config.py b/python/cudf/benchmarks/common/config.py index 305a21d0a29..c1e9d4d6116 100644 --- a/python/cudf/benchmarks/common/config.py +++ b/python/cudf/benchmarks/common/config.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Module used for global configuration of benchmarks. @@ -20,6 +20,7 @@ in this file and import them in conftest.py to ensure that they are handled appropriately. """ + import os import sys diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 6e53195ac2d..d685174f3c2 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -99,9 +99,9 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): low=1, high=10 ) else: - meta[ - "max_types_at_each_level" - ] = obj._max_struct_types_at_each_level + meta["max_types_at_each_level"] = ( + obj._max_struct_types_at_each_level + ) elif dtype == "decimal64": meta["max_precision"] = cudf.Decimal64Dtype.MAX_PRECISION diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 8d278c9c065..1631fa00412 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -181,7 +181,7 @@ def _from_host_memory(cls, data: Any) -> Self: Parameters ---------- data : Any - An object that represens host memory. + An object that represents host memory. 
Returns ------- diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index b25af13679c..a9569190e75 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -154,7 +154,7 @@ def _from_host_memory(cls, data: Any) -> Self: Parameters ---------- data : Any - An object that represens host memory. + An object that represents host memory. Returns ------- diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 2a46654ccc2..e7119fcdf47 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -4,7 +4,6 @@ isort: skip_file """ - from cudf.core.column.categorical import CategoricalColumn from cudf.core.column.column import ( ColumnBase, diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 0f5a0eb086b..e827c7a3dd3 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -26,8 +26,7 @@ def _return_or_inplace( inplace: Literal[True], expand: bool = False, retain_index: bool = True, - ) -> None: - ... + ) -> None: ... @overload def _return_or_inplace( @@ -36,8 +35,7 @@ def _return_or_inplace( inplace: Literal[False], expand: bool = False, retain_index: bool = True, - ) -> ParentType: - ... + ) -> ParentType: ... @overload def _return_or_inplace( @@ -45,8 +43,7 @@ def _return_or_inplace( new_col, expand: bool = False, retain_index: bool = True, - ) -> ParentType: - ... + ) -> ParentType: ... @overload def _return_or_inplace( @@ -55,8 +52,7 @@ def _return_or_inplace( inplace: bool = False, expand: bool = False, retain_index: bool = True, - ) -> Optional[ParentType]: - ... + ) -> Optional[ParentType]: ... 
def _return_or_inplace( self, new_col, inplace=False, expand=False, retain_index=True diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fb76fcdaf39..06d7aa030db 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -257,14 +257,12 @@ def byte_count(self) -> SeriesOrIndex: @overload def cat( self, sep: Optional[str] = None, na_rep: Optional[str] = None - ) -> str: - ... + ) -> str: ... @overload def cat( self, others, sep: Optional[str] = None, na_rep: Optional[str] = None - ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]: - ... + ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]: ... def cat(self, others=None, sep=None, na_rep=None): """ diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index bead9c352ef..e55898de675 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1220,9 +1220,9 @@ def __init__( ) -> None: if isinstance(path, str) and path.startswith("s3://"): self.fs_meta = {"is_s3": True, "actual_path": path} - self.dir_: Optional[ - tempfile.TemporaryDirectory - ] = tempfile.TemporaryDirectory() + self.dir_: Optional[tempfile.TemporaryDirectory] = ( + tempfile.TemporaryDirectory() + ) self.path = self.dir_.name else: self.fs_meta = {} diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index f1744c9e92b..8870fbc5c28 100644 --- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -11,6 +11,7 @@ Example: python analyze-test-failures.py log.json frame/* """ + import json import sys from collections import Counter diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index bfc56319d82..ffd2abb960d 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -10,6 +10,7 @@ python summarize-test-results.py log.json --output json python summarize-test-results.py log.json --output table """ + import argparse import json diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 05213d7601c..ebbca57bd40 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -3,6 +3,7 @@ """ Test related to Index """ + import datetime import operator import re diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 53919a95115..3c627a5fe89 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -4,6 +4,7 @@ Tests related to is_unique, is_monotonic_increasing & is_monotonic_decreasing attributes """ + import numpy as np import pandas as pd import pytest diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 4926d79e734..76a82afb78e 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -3,6 +3,7 @@ """ Test related to MultiIndex """ + import datetime import itertools import operator diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 
68447f423a4..4136d97d69f 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -3,6 +3,7 @@ """ Helper functions for parameterized docstring """ + import functools import re import string diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index e9dbc23d767..8521239413e 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -587,7 +587,7 @@ def find_common_type(dtypes): def _dtype_pandas_compatible(dtype): """ A utility function, that returns `str` instead of `object` - dtype when pandas comptibility mode is enabled. + dtype when pandas compatibility mode is enabled. """ if cudf.get_option("mode.pandas_compatible") and dtype == cudf.dtype("O"): return "str" From 08d86c92b3e3ccd950e4d63033d44675510cbb74 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 2 Apr 2024 12:29:43 -0700 Subject: [PATCH 020/272] Fix errors in chunked ORC writer when no tables were (successfully) written (#15393) Closes https://github.com/rapidsai/cudf/issues/15386, https://github.com/rapidsai/cudf/issues/15387 The fixes for the two issues overlap, so I included both in a single PR. Expanded the `_closed` flag to an enum that tracks if the operations in `close()` should be performed (one or more tables were written to the sink). This way, we don't perform the steps in close when there is no valid file to write the footer for. This includes: - No `write` calls; - All `write` calls failed; The new enum replaces `skip_close()` that used to fix this issue for a smaller subset of cases. Additionally, writing of the ORC header has been moved after the encode and uses the new state to only write the header in the first `write` call. This way we don't write anything to the sink if there were no `write` calls with the writer, and if the encode failed in the `write`s. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15393 --- cpp/include/cudf/io/detail/orc.hpp | 8 ----- cpp/src/io/functions.cpp | 11 +----- cpp/src/io/orc/writer_impl.cu | 29 +++++++-------- cpp/src/io/orc/writer_impl.hpp | 20 +++++------ cpp/tests/io/orc_test.cpp | 58 +++++++++++++++++++++++++++--- 5 files changed, 79 insertions(+), 47 deletions(-) diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 3c1486b60c2..c63c952e148 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -124,14 +124,6 @@ class writer { * @brief Finishes the chunked/streamed write process. */ void close(); - - /** - * @brief Skip work done in `close()`; should be called if `write()` failed. - * - * Calling skip_close() prevents the writer from writing the (invalid) file footer and the - * postscript. - */ - void skip_close(); }; } // namespace orc::detail } // namespace cudf::io diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index b8353d312fe..46c6c67c8df 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -436,16 +436,7 @@ void write_orc(orc_writer_options const& options, rmm::cuda_stream_view stream) auto writer = std::make_unique( std::move(sinks[0]), options, io_detail::single_write_mode::YES, stream); - try { - writer->write(options.get_table()); - } catch (...) { - // If an exception is thrown, the output is incomplete/corrupted. - // Make sure the writer will not close with such corrupted data. - // In addition, the writer may throw an exception while trying to close, which would terminate - // the process. 
- writer->skip_close(); - throw; - } + writer->write(options.get_table()); } /** diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index ade0e75de35..750a593920c 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -2438,7 +2438,6 @@ writer::impl::impl(std::unique_ptr sink, if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } - init_state(); } writer::impl::impl(std::unique_ptr sink, @@ -2460,20 +2459,13 @@ writer::impl::impl(std::unique_ptr sink, if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } - init_state(); } writer::impl::~impl() { close(); } -void writer::impl::init_state() -{ - // Write file header - _out_sink->host_write(MAGIC, std::strlen(MAGIC)); -} - void writer::impl::write(table_view const& input) { - CUDF_EXPECTS(not _closed, "Data has already been flushed to out and closed"); + CUDF_EXPECTS(_state != writer_state::CLOSED, "Data has already been flushed to out and closed"); if (not _table_meta) { _table_meta = make_table_meta(input); } @@ -2516,6 +2508,11 @@ void writer::impl::write(table_view const& input) } }(); + if (_state == writer_state::NO_DATA_WRITTEN) { + // Write the ORC file header if this is the first write + _out_sink->host_write(MAGIC, std::strlen(MAGIC)); + } + // Compression/encoding were all successful. Now write the intermediate results. 
write_orc_data_to_sink(enc_data, segmentation, @@ -2533,6 +2530,8 @@ void writer::impl::write(table_view const& input) // Update file-level and compression statistics update_statistics(orc_table.num_rows(), std::move(intermediate_stats), compression_stats); + + _state = writer_state::DATA_WRITTEN; } void writer::impl::update_statistics( @@ -2683,8 +2682,11 @@ void writer::impl::add_table_to_footer_data(orc_table_view const& orc_table, void writer::impl::close() { - if (_closed) { return; } - _closed = true; + if (_state != writer_state::DATA_WRITTEN) { + // writer is either closed or no data has been written + _state = writer_state::CLOSED; + return; + } PostScript ps; if (_stats_freq != statistics_freq::STATISTICS_NONE) { @@ -2769,6 +2771,8 @@ void writer::impl::close() pbw.put_byte(ps_length); _out_sink->host_write(pbw.data(), pbw.size()); _out_sink->flush(); + + _state = writer_state::CLOSED; } // Forward to implementation @@ -2795,9 +2799,6 @@ writer::~writer() = default; // Forward to implementation void writer::write(table_view const& table) { _impl->write(table); } -// Forward to implementation -void writer::skip_close() { _impl->skip_close(); } - // Forward to implementation void writer::close() { _impl->close(); } diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 417d29efb58..bd082befe0c 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -227,6 +227,14 @@ struct encoded_footer_statistics { std::vector file_level; }; +enum class writer_state { + NO_DATA_WRITTEN, // No table data has been written to the sink; if the writer is closed or + // destroyed in this state, it should not write the footer. + DATA_WRITTEN, // At least one table has been written to the sink; when the writer is closed, + // it should write the footer. + CLOSED // Writer has been closed; no further writes are allowed. 
+}; + /** * @brief Implementation for ORC writer */ @@ -266,11 +274,6 @@ class writer::impl { */ ~impl(); - /** - * @brief Begins the chunked/streamed write process. - */ - void init_state(); - /** * @brief Writes a single subtable as part of a larger ORC file/table write. * @@ -283,11 +286,6 @@ class writer::impl { */ void close(); - /** - * @brief Skip writing the footer when closing/deleting the writer. - */ - void skip_close() { _closed = true; } - private: /** * @brief Write the intermediate ORC data into the data sink. @@ -363,7 +361,7 @@ class writer::impl { Footer _footer; Metadata _orc_meta; persisted_statistics _persisted_stripe_statistics; // Statistics data saved between calls. - bool _closed = false; // To track if the output has been written to sink. + writer_state _state = writer_state::NO_DATA_WRITTEN; }; } // namespace cudf::io::orc::detail diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 24e2e2cfea0..e108e68e1f9 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -2100,8 +2101,7 @@ TEST_F(OrcWriterTest, BounceBufferBug) auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100; }); constexpr auto num_rows = 150000; - column_wrapper col(sequence, - sequence + num_rows); + column_wrapper col(sequence, sequence + num_rows); table_view expected({col}); auto filepath = temp_env->get_temp_filepath("BounceBufferBug.orc"); @@ -2120,8 +2120,7 @@ TEST_F(OrcReaderTest, SizeTypeRowsOverflow) static_assert(total_rows > std::numeric_limits::max()); auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 127; }); - column_wrapper col(sequence, - sequence + num_rows); + column_wrapper col(sequence, sequence + num_rows); table_view chunk_table({col}); std::vector out_buffer; @@ -2169,4 +2168,55 @@ TEST_F(OrcReaderTest, SizeTypeRowsOverflow) 
CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_with_stripe_selection->view()); } +TEST_F(OrcChunkedWriterTest, NoWriteCloseNotThrow) +{ + std::vector out_buffer; + + cudf::io::chunked_orc_writer_options write_opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&out_buffer}); + auto writer = cudf::io::orc_chunked_writer(write_opts); + + EXPECT_NO_THROW(writer.close()); +} + +TEST_F(OrcChunkedWriterTest, FailedWriteCloseNotThrow) +{ + // A sink that throws on write() + class throw_sink : public cudf::io::data_sink { + public: + void host_write(void const* data, size_t size) override { throw std::runtime_error("write"); } + void flush() override {} + size_t bytes_written() override { return 0; } + }; + + auto sequence = thrust::make_counting_iterator(0); + column_wrapper col(sequence, sequence + 10); + table_view table({col}); + + throw_sink sink; + cudf::io::chunked_orc_writer_options write_opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&sink}); + auto writer = cudf::io::orc_chunked_writer(write_opts); + + try { + writer.write(table); + } catch (...) 
{ + // ignore the exception; we're testing that close() doesn't throw when the only write() fails + } + + EXPECT_NO_THROW(writer.close()); +} + +TEST_F(OrcChunkedWriterTest, NoDataInSinkWhenNoWrite) +{ + std::vector out_buffer; + + cudf::io::chunked_orc_writer_options write_opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&out_buffer}); + auto writer = cudf::io::orc_chunked_writer(write_opts); + + EXPECT_NO_THROW(writer.close()); + EXPECT_EQ(out_buffer.size(), 0); +} + CUDF_TEST_PROGRAM_MAIN() From 13a5c7be33bec538a9f81872471c29796e67bce5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 2 Apr 2024 16:54:09 -0400 Subject: [PATCH 021/272] Rework cudf::replace_nulls to use strings::detail::copy_if_else (#15286) Removes the specialized kernels for strings in `cudf::replace_nulls` and replaces them with a call to `cudf::strings::detail::copy_if_else` which is already enabled with offsetalator support and optimized for long strings. This will also allow `cudf::replace_nulls` to use large strings with no further changes. Also includes a `replace_nulls` benchmark for strings. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15286 --- cpp/benchmarks/CMakeLists.txt | 3 +- cpp/benchmarks/replace/nulls.cpp | 59 ++++++++++++++ cpp/src/replace/nulls.cu | 127 +++++-------------------------- 3 files changed, 79 insertions(+), 110 deletions(-) create mode 100644 cpp/benchmarks/replace/nulls.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index c82e475dece..798e4e76141 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -208,8 +208,9 @@ ConfigureNVBench( ) # ################################################################################################## -# * reduction benchmark --------------------------------------------------------------------------- +# * replace benchmark --------------------------------------------------------------------------- ConfigureBench(REPLACE_BENCH replace/clamp.cpp replace/nans.cpp) +ConfigureNVBench(REPLACE_NVBENCH replace/nulls.cpp) # ################################################################################################## # * filling benchmark ----------------------------------------------------------------------------- diff --git a/cpp/benchmarks/replace/nulls.cpp b/cpp/benchmarks/replace/nulls.cpp new file mode 100644 index 00000000000..ccd00050789 --- /dev/null +++ b/cpp/benchmarks/replace/nulls.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +static void replace_nulls(nvbench::state& state) +{ + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("row_width")); + + if (static_cast(n_rows) * static_cast(max_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width); + + auto const input_table = create_random_table( + {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{n_rows}, table_profile); + auto const input = input_table->view().column(0); + auto const repl = input_table->view().column(1); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = cudf::strings_column_view(input).chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::replace_nulls(input, repl); }); +} + +NVBENCH_BENCH(replace_nulls) + .set_name("replace_nulls") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) + .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}); diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 014171f2b40..299cdc6a160 100644 --- 
a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -32,8 +32,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -56,63 +56,6 @@ namespace { // anonymous static constexpr int BLOCK_SIZE = 256; -template -CUDF_KERNEL void replace_nulls_strings(cudf::column_device_view input, - cudf::column_device_view replacement, - cudf::bitmask_type* output_valid, - cudf::size_type* offsets, - char* chars, - cudf::size_type* valid_counter) -{ - cudf::size_type nrows = input.size(); - auto i = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - - uint32_t active_mask = 0xffff'ffff; - active_mask = __ballot_sync(active_mask, i < nrows); - auto const lane_id{threadIdx.x % cudf::detail::warp_size}; - uint32_t valid_sum{0}; - - while (i < nrows) { - bool input_is_valid = input.is_valid_nocheck(i); - bool output_is_valid = true; - - if (replacement_has_nulls && !input_is_valid) { - output_is_valid = replacement.is_valid_nocheck(i); - } - - cudf::string_view out; - if (input_is_valid) { - out = input.element(i); - } else if (output_is_valid) { - out = replacement.element(i); - } - - bool nonzero_output = (input_is_valid || output_is_valid); - - if (phase == 0) { - offsets[i] = nonzero_output ? 
out.size_bytes() : 0; - uint32_t bitmask = __ballot_sync(active_mask, output_is_valid); - if (0 == lane_id) { - output_valid[cudf::word_index(i)] = bitmask; - valid_sum += __popc(bitmask); - } - } else if (phase == 1) { - if (nonzero_output) std::memcpy(chars + offsets[i], out.data(), out.size_bytes()); - } - - i += stride; - active_mask = __ballot_sync(active_mask, i < nrows); - } - - // Compute total valid count for this block and add it to global count - uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); - // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { - atomicAdd(valid_counter, static_cast(block_valid_count)); - } -} - template CUDF_KERNEL void replace_nulls(cudf::column_device_view input, cudf::column_device_view replacement, @@ -222,58 +165,24 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - rmm::device_scalar valid_counter(0, stream); - cudf::size_type* valid_count = valid_counter.data(); - - auto replace_first = replace_nulls_strings<0, false>; - auto replace_second = replace_nulls_strings<1, false>; - if (replacement.has_nulls()) { - replace_first = replace_nulls_strings<0, true>; - replace_second = replace_nulls_strings<1, true>; + auto d_input = cudf::column_device_view::create(input, stream); + auto d_replacement = cudf::column_device_view::create(replacement, stream); + + auto lhs_iter = + cudf::detail::make_optional_iterator(*d_input, cudf::nullate::YES{}); + auto rhs_iter = cudf::detail::make_optional_iterator( + *d_replacement, cudf::nullate::DYNAMIC{replacement.nullable()}); + + auto filter = cudf::detail::validity_accessor{*d_input}; + auto result = cudf::strings::detail::copy_if_else( + lhs_iter, lhs_iter + input.size(), rhs_iter, filter, stream, mr); + + // input is nullable so result should always be nullable here + if (!result->nullable()) { + result->set_null_mask( + 
cudf::detail::create_null_mask(input.size(), cudf::mask_state::ALL_VALID, stream, mr), 0); } - - // Create new offsets column to use in kernel - std::unique_ptr sizes = cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT32), input.size(), cudf::mask_state::UNALLOCATED, stream); - - auto sizes_view = sizes->mutable_view(); - auto device_in = cudf::column_device_view::create(input, stream); - auto device_replacement = cudf::column_device_view::create(replacement, stream); - - rmm::device_buffer valid_bits = - cudf::detail::create_null_mask(input.size(), cudf::mask_state::UNINITIALIZED, stream, mr); - - // Call first pass kernel to get sizes in offsets - cudf::detail::grid_1d grid{input.size(), BLOCK_SIZE, 1}; - replace_first<<>>( - *device_in, - *device_replacement, - reinterpret_cast(valid_bits.data()), - sizes_view.begin(), - nullptr, - valid_count); - - auto [offsets, bytes] = cudf::detail::make_offsets_child_column( - sizes_view.begin(), sizes_view.end(), stream, mr); - - auto offsets_view = offsets->mutable_view(); - - // Allocate chars array and output null mask - rmm::device_uvector output_chars(bytes, stream, mr); - - replace_second<<>>( - *device_in, - *device_replacement, - reinterpret_cast(valid_bits.data()), - offsets_view.begin(), - output_chars.data(), - valid_count); - - return cudf::make_strings_column(input.size(), - std::move(offsets), - output_chars.release(), - input.size() - valid_counter.value(stream), - std::move(valid_bits)); + return result; } template <> From 2584fd9d1e1fffb2aefd0417ba0994d7a563e076 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 2 Apr 2024 16:39:46 -0700 Subject: [PATCH 022/272] Test static builds in CI and fix nanoarrow configure (#15437) Resolves #15275 Resolves #15434 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Jake Awe (https://github.com/AyodeAwe) - Robert Maynard (https://github.com/robertmaynard) - Bradley Dice 
(https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15437 --- .github/workflows/pr.yaml | 11 ++ .github/workflows/test.yaml | 10 ++ ci/configure_cpp_static.sh | 23 +++ cpp/cmake/thirdparty/get_nanoarrow.cmake | 20 +++ .../thirdparty/patches/nanoarrow_cmake.diff | 161 ++++++++++++++++++ dependencies.yaml | 18 +- 6 files changed, 239 insertions(+), 4 deletions(-) create mode 100755 ci/configure_cpp_static.sh create mode 100644 cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 303988212d3..2d7ebb62fa8 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -20,6 +20,7 @@ jobs: - conda-python-cudf-tests - conda-python-other-tests - conda-java-tests + - static-configure - conda-notebook-tests - docs-build - wheel-build-cudf @@ -88,6 +89,16 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" + static-configure: + needs: checks + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + with: + build_type: pull-request + # Use the wheel container so we can skip conda solves and since our + # primary static consumers (Spark) are not in conda anyway. 
+ container_image: "rapidsai/ci-wheel:latest" + run_script: "ci/configure_cpp_static.sh" conda-notebook-tests: needs: conda-python-build secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 6f7aef79881..ea47b6ad466 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -43,6 +43,16 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" + static-configure: + needs: checks + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + with: + build_type: pull-request + # Use the wheel container so we can skip conda solves and since our + # primary static consumers (Spark) are not in conda anyway. + container_image: "rapidsai/ci-wheel:latest" + run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh new file mode 100755 index 00000000000..675e0c3981f --- /dev/null +++ b/ci/configure_cpp_static.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +set -euo pipefail + +rapids-configure-conda-channels + +source rapids-date-string + +rapids-logger "Configure static cpp build" + +ENV_YAML_DIR="$(mktemp -d)" +REQUIREMENTS_FILE="${ENV_YAML_DIR}/requirements.txt" + +rapids-dependency-file-generator \ + --output requirements \ + --file_key test_static_build \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${REQUIREMENTS_FILE}" + +python -m pip install -r "${REQUIREMENTS_FILE}" +pyenv rehash + +cmake -S cpp -B build_static -GNinja -DBUILD_SHARED_LIBS=OFF -DBUILD_TESTS=OFF diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index be938a89ccd..4316db99a8d 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -17,6 +17,25 @@ function(find_and_configure_nanoarrow) set(oneValueArgs VERSION FORK PINNED_TAG) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + # Only run if PKG_VERSION is < 0.5.0 + if(PKG_VERSION VERSION_LESS 0.5.0) + set(patch_files_to_run "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches/nanoarrow_cmake.diff") + set(patch_issues_to_ref + "Fix issues with nanoarrow CMake [https://github.com/apache/arrow-nanoarrow/pull/406]" + ) + set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/nanoarrow/patch.cmake") + set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/nanoarrow/log") + string(TIMESTAMP current_year "%Y" UTC) + configure_file( + ${rapids-cmake-dir}/cpm/patches/command_template.cmake.in "${patch_script}" @ONLY + ) + else() + message( + FATAL_ERROR + "Nanoarrow version ${PKG_VERSION} already contains the necessary patch. Please remove this patch from cudf." + ) + endif() + rapids_cpm_find( nanoarrow ${PKG_VERSION} GLOBAL_TARGETS nanoarrow @@ -26,6 +45,7 @@ function(find_and_configure_nanoarrow) # TODO: Commit hashes are not supported with shallow clones. Can switch this if and when we pin # to an actual tag. 
GIT_SHALLOW FALSE + PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script} OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff new file mode 100644 index 00000000000..b53e134ed2c --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff @@ -0,0 +1,161 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 8714c70..1feec13 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -49,7 +49,6 @@ else() + endif() + + option(NANOARROW_CODE_COVERAGE "Enable coverage reporting" OFF) +-add_library(coverage_config INTERFACE) + + # Avoids a warning about timestamps on downloaded files (prefer new policy + # if available)) +@@ -111,6 +110,8 @@ if(NANOARROW_BUNDLE) + if(NANOARROW_BUILD_TESTS) + include_directories(${CMAKE_BINARY_DIR}/amalgamation) + add_library(nanoarrow ${NANOARROW_C_TEMP}) ++ add_library(nanoarrow::nanoarrow ALIAS nanoarrow) ++ + target_compile_definitions(nanoarrow PUBLIC "$<$:NANOARROW_DEBUG>") + endif() + +@@ -120,6 +121,7 @@ if(NANOARROW_BUNDLE) + else() + add_library(nanoarrow src/nanoarrow/array.c src/nanoarrow/schema.c + src/nanoarrow/array_stream.c src/nanoarrow/utils.c) ++ add_library(nanoarrow::nanoarrow ALIAS nanoarrow) + + target_include_directories(nanoarrow + PUBLIC $ +@@ -154,13 +156,50 @@ else() + endif() + endif() + +- install(TARGETS nanoarrow DESTINATION lib) ++ install(TARGETS nanoarrow ++ DESTINATION lib ++ EXPORT nanoarrow-exports) + install(DIRECTORY src/ + DESTINATION include + FILES_MATCHING +- PATTERN "*.h") ++ PATTERN "*.h*") + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/generated/nanoarrow_config.h + DESTINATION include/nanoarrow) ++ ++ # Generate package files for the build and install trees. 
++ include(CMakePackageConfigHelpers) ++ include(GNUInstallDirs) ++ ++ foreach(tree_type BUILD INSTALL) ++ if(tree_type STREQUAL "BUILD") ++ set(install_location ".") ++ else() ++ set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/nanoarrow") ++ endif() ++ ++ set(build_location "${PROJECT_BINARY_DIR}/${install_location}") ++ write_basic_package_version_file( ++ "${build_location}/nanoarrow-config-version.cmake" ++ VERSION ${nanoarrow_VERSION} ++ # After 1.0.0, we can use `SameMajorVersion` here. ++ COMPATIBILITY ExactVersion) ++ configure_package_config_file("${CMAKE_CURRENT_LIST_DIR}/cmake/config.cmake.in" ++ "${build_location}/nanoarrow-config.cmake" ++ INSTALL_DESTINATION "${install_location}") ++ ++ if(tree_type STREQUAL "BUILD") ++ export(EXPORT nanoarrow-exports ++ FILE "${build_location}/nanoarrow-targets.cmake" ++ NAMESPACE nanoarrow::) ++ ++ else() ++ install(DIRECTORY "${build_location}/" DESTINATION "${install_location}") ++ install(EXPORT nanoarrow-exports ++ DESTINATION "${install_location}" ++ FILE "nanoarrow-targets.cmake" ++ NAMESPACE nanoarrow::) ++ endif() ++ endforeach() + endif() + + # Always build integration test if building tests +@@ -215,34 +254,18 @@ if(NANOARROW_BUILD_TESTS) + src/nanoarrow/integration/c_data_integration_test.cc) + + if(NANOARROW_CODE_COVERAGE) +- target_compile_options(coverage_config INTERFACE -O0 -g --coverage) +- target_link_options(coverage_config INTERFACE --coverage) +- target_link_libraries(nanoarrow coverage_config) ++ target_compile_options(nanoarrow PUBLIC -O0 -g --coverage) ++ target_link_options(nanoarrow PUBLIC --coverage) + endif() + +- target_link_libraries(utils_test +- nanoarrow +- gtest_main +- ${NANOARROW_ARROW_TARGET} +- coverage_config) +- target_link_libraries(buffer_test nanoarrow gtest_main coverage_config) +- target_link_libraries(array_test +- nanoarrow +- gtest_main +- ${NANOARROW_ARROW_TARGET} +- coverage_config) +- target_link_libraries(schema_test +- nanoarrow +- gtest_main +- 
${NANOARROW_ARROW_TARGET} +- coverage_config) +- target_link_libraries(array_stream_test nanoarrow gtest_main coverage_config) +- target_link_libraries(nanoarrow_hpp_test nanoarrow gtest_main coverage_config) +- target_link_libraries(nanoarrow_testing_test +- nanoarrow +- gtest_main +- nlohmann_json::nlohmann_json +- coverage_config) ++ target_link_libraries(utils_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET}) ++ target_link_libraries(buffer_test nanoarrow gtest_main) ++ target_link_libraries(array_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET}) ++ target_link_libraries(schema_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET}) ++ target_link_libraries(array_stream_test nanoarrow gtest_main) ++ target_link_libraries(nanoarrow_hpp_test nanoarrow gtest_main) ++ target_link_libraries(nanoarrow_testing_test nanoarrow gtest_main ++ nlohmann_json::nlohmann_json) + target_link_libraries(c_data_integration_test nanoarrow nanoarrow_c_data_integration + gtest_main) + +diff --git a/cmake/config.cmake.in b/cmake/config.cmake.in +new file mode 100644 +index 0000000..021dc31 +--- /dev/null ++++ b/cmake/config.cmake.in +@@ -0,0 +1,28 @@ ++# Licensed to the Apache Software Foundation (ASF) under one ++# or more contributor license agreements. See the NOTICE file ++# distributed with this work for additional information ++# regarding copyright ownership. The ASF licenses this file ++# to you under the Apache License, Version 2.0 (the ++# "License"); you may not use this file except in compliance ++# with the License. You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, ++# software distributed under the License is distributed on an ++# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY ++# KIND, either express or implied. See the License for the ++# specific language governing permissions and limitations ++# under the License. 
++ ++ ++@PACKAGE_INIT@ ++ ++cmake_minimum_required(VERSION @CMAKE_MINIMUM_REQUIRED_VERSION@) ++ ++include("${CMAKE_CURRENT_LIST_DIR}/nanoarrow-targets.cmake" REQUIRED) ++include("${CMAKE_CURRENT_LIST_DIR}/nanoarrow-config-version.cmake" REQUIRED) ++ ++set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}") ++include(FindPackageHandleStandardArgs) ++find_package_handle_standard_args(${CMAKE_FIND_PACKAGE_NAME} CONFIG_MODE) diff --git a/dependencies.yaml b/dependencies.yaml index 85f5a86d938..5bb555df818 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -6,6 +6,7 @@ files: cuda: ["11.8", "12.2"] arch: [x86_64] includes: + - build_base - build_all - build_cpp - build_wheels @@ -27,6 +28,10 @@ files: - test_python_cudf - test_python_dask_cudf - depends_on_cupy + test_static_build: + output: none + includes: + - build_base test_cpp: output: none includes: @@ -45,6 +50,7 @@ files: test_java: output: none includes: + - build_base - build_all - cuda - cuda_version @@ -75,6 +81,7 @@ files: extras: table: build-system includes: + - build_base - build_python_common - build_python_cudf py_run_cudf: @@ -144,6 +151,7 @@ files: extras: table: build-system includes: + - build_base - build_python_common py_run_cudf_kafka: output: pyproject @@ -191,12 +199,16 @@ channels: - conda-forge - nvidia dependencies: - build_all: + build_base: common: - - output_types: conda + - output_types: [conda, requirements, pyproject] packages: - &cmake_ver cmake>=3.26.4 - &ninja ninja + build_all: + common: + - output_types: conda + packages: - c-compiler - cxx-compiler - dlpack>=0.8,<1.0 @@ -254,9 +266,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - *cmake_ver - cython>=3.0.3 - - *ninja # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. 
- pyarrow==14.0.2.* From 082f6c91eb3906dbdf785348160ad5631ec91458 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 3 Apr 2024 11:27:47 -0400 Subject: [PATCH 023/272] Use offsetalator in cudf::strings::replace functions (#14824) Adds offsetalator in place of hardcoded offset size_type arrays to the strings replace functions. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14824 --- cpp/src/strings/replace/multi.cu | 236 +++---- cpp/src/strings/replace/replace.cu | 791 +++++++++-------------- cpp/src/strings/replace/replace_nulls.cu | 12 +- cpp/src/strings/replace/replace_slice.cu | 25 +- 4 files changed, 463 insertions(+), 601 deletions(-) diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 8b5a4317b50..c93add01f69 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -14,13 +14,14 @@ * limitations under the License. */ +#include "strings/split/split.cuh" + #include #include -#include #include #include +#include #include -#include #include #include #include @@ -42,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -67,7 +69,7 @@ constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 256; * @brief Type used for holding the target position (first) and the * target index (second). 
*/ -using target_pair = thrust::pair; +using target_pair = thrust::tuple; /** * @brief Helper functions for performing character-parallel replace @@ -75,12 +77,6 @@ using target_pair = thrust::pair; struct replace_multi_parallel_fn { __device__ char const* get_base_ptr() const { return d_strings.head(); } - __device__ size_type const* get_offsets_ptr() const - { - return d_strings.child(strings_column_view::offsets_column_index).data() + - d_strings.offset(); - } - __device__ string_view const get_string(size_type idx) const { return d_strings.element(idx); @@ -100,11 +96,12 @@ struct replace_multi_parallel_fn { * @param idx Index of the byte position in the chars column * @param chars_bytes Number of bytes in the chars column */ - __device__ thrust::optional has_target(size_type idx, size_type chars_bytes) const + __device__ size_type target_index(int64_t idx, int64_t chars_bytes) const { - auto const d_offsets = get_offsets_ptr(); + auto const d_offsets = d_strings_offsets; auto const d_chars = get_base_ptr() + d_offsets[0] + idx; size_type str_idx = -1; + string_view d_str{}; for (std::size_t t = 0; t < d_targets.size(); ++t) { auto const d_tgt = d_targets[t]; if (!d_tgt.empty() && (idx + d_tgt.size_bytes() <= chars_bytes) && @@ -113,12 +110,24 @@ struct replace_multi_parallel_fn { auto const idx_itr = thrust::upper_bound(thrust::seq, d_offsets, d_offsets + d_strings.size(), idx); str_idx = thrust::distance(d_offsets, idx_itr) - 1; + d_str = get_string(str_idx - d_offsets[0]); } - auto const d_str = get_string(str_idx - d_offsets[0]); if ((d_chars + d_tgt.size_bytes()) <= (d_str.data() + d_str.size_bytes())) { return t; } } } - return thrust::nullopt; + return -1; + } + + __device__ bool has_target(int64_t idx, int64_t chars_bytes) const + { + auto const d_chars = get_base_ptr() + d_strings_offsets[0] + idx; + for (auto& d_tgt : d_targets) { + if (!d_tgt.empty() && (idx + d_tgt.size_bytes() <= chars_bytes) && + (d_tgt.compare(d_chars, d_tgt.size_bytes()) == 0)) 
{ + return true; + } + } + return false; } /** @@ -133,28 +142,32 @@ struct replace_multi_parallel_fn { * @return Number of substrings resulting from the replace operations on this row */ __device__ size_type count_strings(size_type idx, - target_pair const* d_positions, - size_type const* d_targets_offsets) const + int64_t const* d_positions, + size_type const* d_indices, + cudf::detail::input_offsetalator d_targets_offsets) const { if (!is_valid(idx)) { return 0; } - auto const d_str = get_string(idx); - auto const d_str_end = d_str.data() + d_str.size_bytes(); - auto const base_ptr = get_base_ptr(); - auto const targets_positions = cudf::device_span( - d_positions + d_targets_offsets[idx], d_targets_offsets[idx + 1] - d_targets_offsets[idx]); + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr(); + + auto const target_offset = d_targets_offsets[idx]; + auto const targets_size = static_cast(d_targets_offsets[idx + 1] - target_offset); + auto const positions = d_positions + target_offset; + auto const indices = d_indices + target_offset; size_type count = 1; // always at least one string auto str_ptr = d_str.data(); - for (auto d_pair : targets_positions) { - auto const d_pos = d_pair.first; - auto const d_tgt = d_targets[d_pair.second]; - auto const tgt_ptr = base_ptr + d_pos; + for (std::size_t i = 0; i < targets_size; ++i) { + auto const tgt_idx = indices[i]; + auto const d_tgt = d_targets[tgt_idx]; + auto const tgt_ptr = base_ptr + positions[i]; if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) { auto const keep_size = static_cast(thrust::distance(str_ptr, tgt_ptr)); if (keep_size > 0) { count++; } // don't bother counting empty strings - auto const d_repl = get_replacement_string(d_pair.second); + auto const d_repl = get_replacement_string(tgt_idx); if (!d_repl.empty()) { count++; } str_ptr += keep_size + d_tgt.size_bytes(); @@ -182,9 +195,10 @@ struct replace_multi_parallel_fn { 
* @return The size in bytes of the output string for this row */ __device__ size_type get_strings(size_type idx, - size_type const* d_offsets, - target_pair const* d_positions, - size_type const* d_targets_offsets, + cudf::detail::input_offsetalator const d_offsets, + int64_t const* d_positions, + size_type const* d_indices, + cudf::detail::input_offsetalator d_targets_offsets, string_index_pair* d_all_strings) const { if (!is_valid(idx)) { return 0; } @@ -194,22 +208,24 @@ struct replace_multi_parallel_fn { auto const d_str_end = d_str.data() + d_str.size_bytes(); auto const base_ptr = get_base_ptr(); - auto const targets_positions = cudf::device_span( - d_positions + d_targets_offsets[idx], d_targets_offsets[idx + 1] - d_targets_offsets[idx]); + auto const target_offset = d_targets_offsets[idx]; + auto const targets_size = static_cast(d_targets_offsets[idx + 1] - target_offset); + auto const positions = d_positions + target_offset; + auto const indices = d_indices + target_offset; size_type output_idx = 0; size_type output_size = 0; auto str_ptr = d_str.data(); - for (auto d_pair : targets_positions) { - auto const d_pos = d_pair.first; - auto const d_tgt = d_targets[d_pair.second]; - auto const tgt_ptr = base_ptr + d_pos; + for (std::size_t i = 0; i < targets_size; ++i) { + auto const tgt_idx = indices[i]; + auto const d_tgt = d_targets[tgt_idx]; + auto const tgt_ptr = base_ptr + positions[i]; if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) { auto const keep_size = static_cast(thrust::distance(str_ptr, tgt_ptr)); if (keep_size > 0) { d_output[output_idx++] = string_index_pair{str_ptr, keep_size}; } output_size += keep_size; - auto const d_repl = get_replacement_string(d_pair.second); + auto const d_repl = get_replacement_string(tgt_idx); if (!d_repl.empty()) { d_output[output_idx++] = string_index_pair{d_repl.data(), d_repl.size_bytes()}; } @@ -228,14 +244,19 @@ struct replace_multi_parallel_fn { } replace_multi_parallel_fn(column_device_view const& d_strings, + 
cudf::detail::input_offsetalator d_strings_offsets, device_span d_targets, device_span d_replacements) - : d_strings(d_strings), d_targets{d_targets}, d_replacements{d_replacements} + : d_strings(d_strings), + d_strings_offsets(d_strings_offsets), + d_targets{d_targets}, + d_replacements{d_replacements} { } protected: column_device_view d_strings; + cudf::detail::input_offsetalator d_strings_offsets; device_span d_targets; device_span d_replacements; }; @@ -247,17 +268,16 @@ struct replace_multi_parallel_fn { * (this happens sometimes when passing device lambdas to thrust algorithms) */ struct pair_generator { - __device__ target_pair operator()(int idx) const + __device__ target_pair operator()(int64_t idx) const { - auto pos = fn.has_target(idx, chars_bytes); - return target_pair{idx, pos.value_or(-1)}; + return thrust::make_tuple(idx, fn.target_index(idx, chars_bytes)); } replace_multi_parallel_fn fn; - size_type chars_bytes; + int64_t chars_bytes; }; struct copy_if_fn { - __device__ bool operator()(target_pair pos) { return pos.second >= 0; } + __device__ bool operator()(target_pair pos) { return thrust::get<1>(pos) >= 0; } }; std::unique_ptr replace_character_parallel(strings_column_view const& input, @@ -270,105 +290,91 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in auto const strings_count = input.size(); auto const chars_bytes = - cudf::detail::get_value(input.offsets(), input.offset() + strings_count, stream) - - cudf::detail::get_value(input.offsets(), input.offset(), stream); + get_offset_value(input.offsets(), input.offset() + strings_count, stream) - + get_offset_value(input.offsets(), input.offset(), stream); auto d_targets = create_string_vector_from_column(targets, stream, rmm::mr::get_current_device_resource()); auto d_replacements = create_string_vector_from_column(repls, stream, rmm::mr::get_current_device_resource()); - replace_multi_parallel_fn fn{*d_strings, d_targets, d_replacements}; + replace_multi_parallel_fn 
fn{ + *d_strings, + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()), + d_targets, + d_replacements, + }; + + // Count the number of targets in the entire column. + // Note this may over-count in the case where a target spans adjacent strings. + auto target_count = thrust::count_if( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + [fn, chars_bytes] __device__(int64_t idx) { return fn.has_target(idx, chars_bytes); }); - // count the number of targets in the entire column - auto const target_count = thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - [fn, chars_bytes] __device__(size_type idx) { - return fn.has_target(idx, chars_bytes).has_value(); - }); // Create a vector of every target position in the chars column. - // These may include overlapping targets which will be resolved later. - auto targets_positions = rmm::device_uvector(target_count, stream); + // These may also include overlapping targets which will be resolved later. 
+ auto targets_positions = rmm::device_uvector(target_count, stream); + auto targets_indices = rmm::device_uvector(target_count, stream); + + // cudf::detail::make_counting_transform_iterator hardcodes size_type + auto const copy_itr = thrust::make_transform_iterator(thrust::counting_iterator(0), + pair_generator{fn, chars_bytes}); + auto const out_itr = thrust::make_zip_iterator( + thrust::make_tuple(targets_positions.begin(), targets_indices.begin())); + auto const copy_end = + cudf::detail::copy_if_safe(copy_itr, copy_itr + chars_bytes, out_itr, copy_if_fn{}, stream); + + // adjust target count since the copy-if may have eliminated some invalid targets + target_count = std::min(static_cast(std::distance(out_itr, copy_end)), target_count); + targets_positions.resize(target_count, stream); + targets_indices.resize(target_count, stream); auto d_positions = targets_positions.data(); - - auto const copy_itr = - cudf::detail::make_counting_transform_iterator(0, pair_generator{fn, chars_bytes}); - auto const copy_end = thrust::copy_if( - rmm::exec_policy(stream), copy_itr, copy_itr + chars_bytes, d_positions, copy_if_fn{}); + auto d_targets_indices = targets_indices.data(); // create a vector of offsets to each string's set of target positions - auto const targets_offsets = [&] { - auto string_indices = rmm::device_uvector(target_count, stream); - - auto const pos_itr = cudf::detail::make_counting_transform_iterator( - 0, cuda::proclaim_return_type([d_positions] __device__(auto idx) -> int64_t { - return d_positions[idx].first; - })); - auto pos_count = std::distance(d_positions, copy_end); - - auto begin = - cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); - auto end = begin + input.offsets().size(); - thrust::upper_bound( - rmm::exec_policy(stream), begin, end, pos_itr, pos_itr + pos_count, string_indices.begin()); - - // compute offsets per string - auto targets_offsets = rmm::device_uvector(strings_count + 1, stream); - auto 
d_targets_offsets = targets_offsets.data(); - - // memset to zero-out the target counts for any null-entries or strings with no targets - thrust::uninitialized_fill( - rmm::exec_policy(stream), targets_offsets.begin(), targets_offsets.end(), 0); - - // next, count the number of targets per string - auto d_string_indices = string_indices.data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - target_count, - [d_string_indices, d_targets_offsets] __device__(size_type idx) { - auto const str_idx = d_string_indices[idx] - 1; - atomicAdd(d_targets_offsets + str_idx, 1); - }); - // finally, convert the counts into offsets - thrust::exclusive_scan(rmm::exec_policy(stream), - targets_offsets.begin(), - targets_offsets.end(), - targets_offsets.begin()); - return targets_offsets; - }(); - auto const d_targets_offsets = targets_offsets.data(); + auto const targets_offsets = create_offsets_from_positions( + input, targets_positions, stream, rmm::mr::get_current_device_resource()); + auto const d_targets_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(targets_offsets->view()); // compute the number of string segments produced by replace in each string auto counts = rmm::device_uvector(strings_count, stream); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(strings_count), counts.begin(), cuda::proclaim_return_type( - [fn, d_positions, d_targets_offsets] __device__(size_type idx) -> size_type { - return fn.count_strings(idx, d_positions, d_targets_offsets); + [fn, d_positions, d_targets_indices, d_targets_offsets] __device__( + size_type idx) -> size_type { + return fn.count_strings( + idx, d_positions, d_targets_indices, d_targets_offsets); })); // create offsets from the counts - auto offsets = - 
std::get<0>(cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr)); - auto const total_strings = - cudf::detail::get_value(offsets->view(), strings_count, stream); - auto const d_strings_offsets = offsets->view().data(); + auto [offsets, total_strings] = + cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); + auto const d_strings_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()); // build a vector of all the positions for all the strings auto indices = rmm::device_uvector(total_strings, stream); auto d_indices = indices.data(); auto d_sizes = counts.data(); // reusing this vector to hold output sizes now thrust::for_each_n( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), strings_count, - [fn, d_strings_offsets, d_positions, d_targets_offsets, d_indices, d_sizes] __device__( - size_type idx) { - d_sizes[idx] = - fn.get_strings(idx, d_strings_offsets, d_positions, d_targets_offsets, d_indices); + [fn, + d_strings_offsets, + d_positions, + d_targets_indices, + d_targets_offsets, + d_indices, + d_sizes] __device__(size_type idx) { + d_sizes[idx] = fn.get_strings( + idx, d_strings_offsets, d_positions, d_targets_indices, d_targets_offsets, d_indices); }); // use this utility to gather the string parts into a contiguous chars column @@ -376,8 +382,8 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in auto chars_data = chars->release().data; // create offsets from the sizes - offsets = - std::get<0>(cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr)); + offsets = std::get<0>( + cudf::strings::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr)); // build the strings columns from the chars and offsets return make_strings_column(strings_count, diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 
1f752f543d0..2c548f2f7cd 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -14,20 +14,21 @@ * limitations under the License. */ +#include "strings/split/split.cuh" + #include #include -#include #include #include -#include +#include #include #include +#include #include #include #include #include #include -#include #include #include @@ -39,11 +40,7 @@ #include #include #include -#include #include -#include -#include -#include #include namespace cudf { @@ -52,505 +49,375 @@ namespace detail { namespace { /** - * @brief Average string byte-length threshold for deciding character-level vs row-level parallel - * algorithm. + * @brief Threshold to decide on using string or character-parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * the character-parallel function is used. + * Otherwise, a regular string-parallel function is used. * - * This value was determined by running the replace string scalar benchmark against different - * power-of-2 string lengths and observing the point at which the performance only improved for - * all trials. + * This value was found using the replace-multi benchmark results using an + * RTX A6000. */ -constexpr size_type BYTES_PER_VALID_ROW_THRESHOLD = 64; +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 256; /** - * @brief Function logic for the row-level parallelism replace API. - * - * This will perform a replace operation on each string. 
+ * @brief Helper functions for performing character-parallel replace */ -struct replace_row_parallel_fn { - column_device_view const d_strings; - string_view const d_target; - string_view const d_repl; - int32_t const max_repl; - int32_t* d_offsets{}; - char* d_chars{}; +struct replace_parallel_chars_fn { + __device__ inline char const* get_base_ptr() const { return d_strings.head(); } - __device__ void operator()(size_type idx) + __device__ inline string_view const get_string(size_type idx) const { - if (d_strings.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; - return; - } - auto const d_str = d_strings.element(idx); - char const* in_ptr = d_str.data(); - - char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; - auto max_n = (max_repl < 0) ? d_str.length() : max_repl; - auto bytes = d_str.size_bytes(); - auto position = d_str.find(d_target); - - size_type last_pos = 0; - while ((position != string_view::npos) && (max_n > 0)) { - if (out_ptr) { - auto const curr_pos = d_str.byte_offset(position); - out_ptr = copy_and_increment(out_ptr, in_ptr + last_pos, curr_pos - last_pos); // copy left - out_ptr = copy_string(out_ptr, d_repl); // copy repl - last_pos = curr_pos + d_target.size_bytes(); - } else { - bytes += d_repl.size_bytes() - d_target.size_bytes(); - } - position = d_str.find(d_target, position + d_target.length()); - --max_n; - } - if (out_ptr) // copy whats left (or right depending on your point of view) - memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos); - else - d_offsets[idx] = bytes; + return d_strings.element(idx); } -}; -/** - * @brief Functor for detecting falsely-overlapped target positions. - * - * This functor examines target positions that have been flagged as potentially overlapped by - * a previous target position and identifies the overlaps that are false. A false overlap can occur - * when a target position is overlapped by another target position that is itself overlapped. 
- * - * For example, a target string of "+++" and string to search of "++++++" will generate 4 potential - * target positions at char offsets 0 through 3. The targets at offsets 1, 2, and 3 will be flagged - * as potential overlaps since a prior target position is within range of the target string length. - * The targets at offset 1 and 2 are true overlaps, since the footprint of the valid target at - * offset 0 overlaps with them. The target at offset 3 is not truly overlapped because it is only - * overlapped by invalid targets, targets that were themselves overlapped by a valid target. - */ -struct target_false_overlap_filter_fn { - size_type const* const d_overlap_pos_indices{}; - size_type const* const d_target_positions{}; - size_type const target_size{}; + __device__ inline bool is_valid(size_type idx) const { return d_strings.is_valid(idx); } - __device__ bool operator()(size_type overlap_idx) const + /** + * @brief Returns true if the target string is found at the given byte position + * in the input strings column and is legally within a string row + * + * @param idx Index of the byte position in the chars column + */ + __device__ bool is_target_within_row(int64_t idx) const { - if (overlap_idx == 0) { - // The first overlap has no prior overlap to chain, so it should be kept as an overlap. 
- return false; + auto const d_offsets = d_strings_offsets; + auto const d_chars = get_base_ptr() + idx; + auto const d_tgt = d_target; + auto const chars_end = chars_bytes + d_offsets[0]; + if (!d_tgt.empty() && (idx + d_tgt.size_bytes() <= chars_end) && + (d_tgt.compare(d_chars, d_tgt.size_bytes()) == 0)) { + auto const idx_itr = + thrust::upper_bound(thrust::seq, d_offsets, d_offsets + d_strings.size(), idx); + auto str_idx = static_cast(thrust::distance(d_offsets, idx_itr) - 1); + auto d_str = get_string(str_idx); + if ((d_chars + d_tgt.size_bytes()) <= (d_str.data() + d_str.size_bytes())) { return true; } } + return false; + } - size_type const this_pos_idx = d_overlap_pos_indices[overlap_idx]; - - // Searching backwards for the first target position index of an overlap that is not adjacent - // to its overlap predecessor. The result will be the first overlap in this chain of overlaps. - size_type first_overlap_idx = overlap_idx; - size_type first_pos_idx = this_pos_idx; - while (first_overlap_idx > 0) { - size_type prev_pos_idx = d_overlap_pos_indices[--first_overlap_idx]; - if (prev_pos_idx + 1 != first_pos_idx) { break; } - first_pos_idx = prev_pos_idx; - } + /** + * @brief Returns true if the target string found at the given byte position + * + * @param idx Index of the byte position in the chars column + */ + __device__ bool has_target(int64_t idx) const + { + auto const d_chars = get_base_ptr() + d_strings_offsets[0] + idx; + return (!d_target.empty() && (idx + d_target.size_bytes() <= chars_bytes) && + (d_target.compare(d_chars, d_target.size_bytes()) == 0)); + } - // The prior target position to the first overlapped position in the chain is a valid target. - size_type valid_pos_idx = first_pos_idx - 1; - size_type valid_pos = d_target_positions[valid_pos_idx]; - - // Walk forward from this valid target. Any targets within the range of this valid one are true - // overlaps. 
The first overlap beyond the range of this valid target is another valid target, - // as it was falsely overlapped by a target that was itself overlapped. Repeat until we get to - // the overlapped position being queried by this call. - while (valid_pos_idx < this_pos_idx) { - size_type next_pos_idx = valid_pos_idx + 1; - size_type next_pos = d_target_positions[next_pos_idx]; - // Every target position within the range of a valid target position is a true overlap. - while (next_pos < valid_pos + target_size) { - if (next_pos_idx == this_pos_idx) { return false; } - next_pos = d_target_positions[++next_pos_idx]; + /** + * @brief Count the number of strings that will be produced by the replace + * + * This includes segments of the string that are not replaced as well as those + * that are replaced. + * + * @param idx Index of the row in d_strings to be processed + * @param d_positions Positions of the targets found in the chars column + * @param d_targets_offsets Offsets identify which target positions go with the current string + * @return Number of substrings resulting from the replace operations on this row + */ + __device__ size_type count_strings(size_type idx, + int64_t const* d_positions, + cudf::detail::input_offsetalator d_targets_offsets) const + { + if (!is_valid(idx)) { return 0; } + + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr(); + auto max_n = (maxrepl < 0) ? 
d_str.length() : maxrepl; + + auto const target_offset = d_targets_offsets[idx]; + auto const targets_size = static_cast(d_targets_offsets[idx + 1] - target_offset); + auto const positions = d_positions + target_offset; + + size_type count = 1; // always at least one string + auto str_ptr = d_str.data(); + for (std::size_t i = 0; (i < targets_size) && (max_n > 0); ++i) { + auto const tgt_ptr = base_ptr + positions[i]; + if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) { + auto const keep_size = static_cast(thrust::distance(str_ptr, tgt_ptr)); + if (keep_size > 0) { count++; } // don't bother counting empty strings + if (!d_replacement.empty()) { count++; } + str_ptr += keep_size + d_target.size_bytes(); + --max_n; } - valid_pos_idx = next_pos_idx; - valid_pos = next_pos; } - - // This was overlapped only by false overlaps and therefore is a valid target. - return true; + return count; } -}; -/** - * @brief Functor for replacing each target string with the replacement string. - * - * This will perform a replace operation at each target position. - */ -struct target_replacer_fn { - device_span const d_target_positions; - char const* const d_in_chars{}; - char* const d_out_chars{}; - size_type const target_size{}; - string_view const d_repl; - int32_t const in_char_offset = 0; - - __device__ void operator()(size_type input_idx) const + /** + * @brief Retrieve the strings for each row + * + * This will return string segments as string_index_pair objects for + * parts of the string that are not replaced interlaced with the + * appropriate replacement string where replacement targets are found. + * + * This function is called only once to produce both the string_index_pair objects + * and the output row size in bytes. 
+ * + * @param idx Index of the row in d_strings + * @param d_offsets Offsets to identify where to store the results of the replace for this string + * @param d_positions The target positions found in the chars column + * @param d_targets_offsets The offsets to identify which target positions go with this string + * @param d_all_strings The output of all the produced string segments + * @return The size in bytes of the output string for this row + */ + __device__ size_type get_strings(size_type idx, + cudf::detail::input_offsetalator const d_offsets, + int64_t const* d_positions, + cudf::detail::input_offsetalator d_targets_offsets, + string_index_pair* d_all_strings) const { - // Calculate the adjustment from input index to output index for each prior target position. - auto const repl_size = d_repl.size_bytes(); - auto const idx_delta_per_pos = repl_size - target_size; - - // determine the number of target positions at or before this character position - size_type const* next_target_pos_ptr = thrust::upper_bound( - thrust::seq, d_target_positions.begin(), d_target_positions.end(), input_idx); - size_type const num_prev_targets = next_target_pos_ptr - d_target_positions.data(); - size_type output_idx = input_idx - in_char_offset + idx_delta_per_pos * num_prev_targets; - - if (num_prev_targets == 0) { - // not within a target string - d_out_chars[output_idx] = d_in_chars[input_idx]; - } else { - // check if this input position is within a target string - size_type const prev_target_pos = *(next_target_pos_ptr - 1); - size_type target_idx = input_idx - prev_target_pos; - if (target_idx < target_size) { - // within the target string, so the original calculation was off by one target string - output_idx -= idx_delta_per_pos; - - // Copy the corresponding byte from the replacement string. If the replacement string is - // larger than the target string then the thread reading the last target byte is - // responsible for copying the remainder of the replacement string. 
- if (target_idx < repl_size) { - d_out_chars[output_idx++] = d_repl.data()[target_idx++]; - if (target_idx == target_size) { - memcpy(d_out_chars + output_idx, d_repl.data() + target_idx, repl_size - target_idx); - } + if (!is_valid(idx)) { return 0; } + + auto const d_output = d_all_strings + d_offsets[idx]; + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr(); + auto max_n = (maxrepl < 0) ? d_str.length() : maxrepl; + + auto const target_offset = d_targets_offsets[idx]; + auto const targets_size = static_cast(d_targets_offsets[idx + 1] - target_offset); + auto const positions = d_positions + target_offset; + + size_type output_idx = 0; + size_type output_size = 0; + auto str_ptr = d_str.data(); + for (std::size_t i = 0; (i < targets_size) && (max_n > 0); ++i) { + auto const tgt_ptr = base_ptr + positions[i]; + if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) { + auto const keep_size = static_cast(thrust::distance(str_ptr, tgt_ptr)); + if (keep_size > 0) { d_output[output_idx++] = string_index_pair{str_ptr, keep_size}; } + output_size += keep_size; + + if (!d_replacement.empty()) { + d_output[output_idx++] = + string_index_pair{d_replacement.data(), d_replacement.size_bytes()}; } - } else { - // not within a target string - d_out_chars[output_idx] = d_in_chars[input_idx]; + output_size += d_replacement.size_bytes(); + + str_ptr += keep_size + d_target.size_bytes(); + --max_n; } } + // include any leftover parts of the string + if (str_ptr <= d_str_end) { + auto const left_size = static_cast(thrust::distance(str_ptr, d_str_end)); + d_output[output_idx] = string_index_pair{str_ptr, left_size}; + output_size += left_size; + } + return output_size; } + + replace_parallel_chars_fn(column_device_view const& d_strings, + cudf::detail::input_offsetalator d_strings_offsets, + int64_t chars_bytes, + string_view d_target, + string_view d_replacement, + cudf::size_type maxrepl) + : 
d_strings(d_strings), + d_strings_offsets(d_strings_offsets), + chars_bytes(chars_bytes), + d_target{d_target}, + d_replacement{d_replacement}, + maxrepl(maxrepl) + { + } + + protected: + column_device_view d_strings; + cudf::detail::input_offsetalator d_strings_offsets; + int64_t chars_bytes; + string_view d_target; + string_view d_replacement; + cudf::size_type maxrepl; }; -/** - * @brief Filter target positions that are overlapped by other, valid target positions. - * - * This performs an in-place modification of the target positions to remove any target positions - * that are overlapped by other, valid target positions. For example, if the target string is "++" - * and the string to search is "+++" then there will be two potential targets at character offsets - * 0 and 1. The target at offset 0 is valid and overlaps the target at offset 1, invalidating the - * target at offset 1. - * - * @param[in,out] d_target_positions Potential target positions to filter in-place. - * @param[in] target_count Number of potential target positions. - * @param[in] target_size Size of the target string in bytes. - * @param[in] stream CUDA stream to use for device operations. - * @return Number of target positions after filtering. - */ -size_type filter_overlap_target_positions(size_type* d_target_positions, - size_type target_count, - size_type target_size, - rmm::cuda_stream_view stream) +std::unique_ptr replace_character_parallel(strings_column_view const& input, + string_view const& d_target, + string_view const& d_replacement, + cudf::size_type maxrepl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - auto overlap_detector = [d_target_positions, target_size] __device__(size_type pos_idx) -> bool { - return (pos_idx > 0) - ? 
d_target_positions[pos_idx] - d_target_positions[pos_idx - 1] < target_size - : false; - }; - - // count the potential number of overlapped target positions - size_type overlap_count = - thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(target_count), - overlap_detector); - if (overlap_count == 0) { return target_count; } - - // create a vector indexing the potential overlapped target positions - rmm::device_uvector potential_overlapped_pos_indices(overlap_count, stream); - auto d_potential_overlapped_pos_indices = potential_overlapped_pos_indices.data(); - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(target_count), - d_potential_overlapped_pos_indices, - overlap_detector); - - // filter out the false overlaps that are actually valid - rmm::device_uvector overlapped_pos_indices(overlap_count, stream); - auto d_overlapped_pos_indices = overlapped_pos_indices.data(); - auto overlap_end = - thrust::remove_copy_if(rmm::exec_policy(stream), - d_potential_overlapped_pos_indices, - d_potential_overlapped_pos_indices + overlap_count, - thrust::make_counting_iterator(0), - d_overlapped_pos_indices, - target_false_overlap_filter_fn{ - d_potential_overlapped_pos_indices, d_target_positions, target_size}); - overlap_count = cudf::distance(d_overlapped_pos_indices, overlap_end); - - // In-place remove any target positions that are overlapped by valid target positions - auto target_pos_end = thrust::remove_if( - rmm::exec_policy(stream), - d_target_positions, - d_target_positions + target_count, + auto d_strings = column_device_view::create(input.parent(), stream); + + auto const strings_count = input.size(); + auto const chars_offset = get_offset_value(input.offsets(), input.offset(), stream); + auto const chars_bytes = + get_offset_value(input.offsets(), input.offset() + strings_count, stream) - chars_offset; + + auto const offsets_begin = + 
cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + + replace_parallel_chars_fn fn{ + *d_strings, offsets_begin, chars_bytes, d_target, d_replacement, maxrepl}; + + // Count the number of targets in the entire column. + // Note this may over-count in the case where a target spans adjacent strings. + auto target_count = thrust::count_if(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + [fn] __device__(int64_t idx) { return fn.has_target(idx); }); + + // Create a vector of every target position in the chars column. + // These may also include overlapping targets which will be resolved later. + auto targets_positions = rmm::device_uvector(target_count, stream); + auto const copy_itr = thrust::counting_iterator(chars_offset); + auto const copy_end = cudf::detail::copy_if_safe( + copy_itr, + copy_itr + chars_bytes + chars_offset, + targets_positions.begin(), + [fn] __device__(int64_t idx) { return fn.is_target_within_row(idx); }, + stream); + + // adjust target count since the copy-if may have eliminated some invalid targets + target_count = std::min(std::distance(targets_positions.begin(), copy_end), target_count); + targets_positions.resize(target_count, stream); + auto d_positions = targets_positions.data(); + + // create a vector of offsets to each string's set of target positions + auto const targets_offsets = create_offsets_from_positions( + input, targets_positions, stream, rmm::mr::get_current_device_resource()); + auto const d_targets_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(targets_offsets->view()); + + // compute the number of string segments produced by replace in each string + auto counts = rmm::device_uvector(strings_count, stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(strings_count), + counts.begin(), + cuda::proclaim_return_type( + [fn, 
d_positions, d_targets_offsets] __device__(size_type idx) -> size_type { + return fn.count_strings(idx, d_positions, d_targets_offsets); + })); + + // create offsets from the counts + auto [offsets, total_strings] = + cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); + auto const d_strings_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()); + + // build a vector of all the positions for all the strings + auto indices = rmm::device_uvector(total_strings, stream); + auto d_indices = indices.data(); + auto d_sizes = counts.data(); // reusing this vector to hold output sizes now + thrust::for_each_n( + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), - [d_overlapped_pos_indices, overlap_count] __device__(size_type target_position_idx) -> bool { - return thrust::binary_search(thrust::seq, - d_overlapped_pos_indices, - d_overlapped_pos_indices + overlap_count, - target_position_idx); + strings_count, + [fn, d_strings_offsets, d_positions, d_targets_offsets, d_indices, d_sizes] __device__( + size_type idx) { + d_sizes[idx] = + fn.get_strings(idx, d_strings_offsets, d_positions, d_targets_offsets, d_indices); }); - return cudf::distance(d_target_positions, target_pos_end); -} -/** - * @brief Filter target positions to remove any invalid target positions. - * - * This performs an in-place modification of the target positions to remove any target positions - * that are invalid, either by the target string overlapping a row boundary or being overlapped by - * another valid target string. - * - * @param[in,out] target_positions Potential target positions to filter in-place. - * @param[in] d_offsets_span Memory range encompassing the string column offsets. - * @param[in] target_size Size of the target string in bytes. - * @param[in] stream CUDA stream to use for device operations. - * @return Number of target positions after filtering. 
- */ -size_type filter_false_target_positions(rmm::device_uvector& target_positions, - device_span d_offsets_span, - size_type target_size, - rmm::cuda_stream_view stream) -{ - // In-place remove any positions for target strings that crossed string boundaries. - auto d_target_positions = target_positions.data(); - auto target_pos_end = - thrust::remove_if(rmm::exec_policy(stream), - d_target_positions, - d_target_positions + target_positions.size(), - [d_offsets_span, target_size] __device__(size_type target_pos) -> bool { - // find the end of the string containing the start of this target - size_type const* offset_ptr = thrust::upper_bound( - thrust::seq, d_offsets_span.begin(), d_offsets_span.end(), target_pos); - return target_pos + target_size > *offset_ptr; - }); - auto const target_count = cudf::distance(d_target_positions, target_pos_end); - if (target_count == 0) { return 0; } - - // Filter out target positions that are the result of overlapping target matches. - return (target_count > 1) - ? filter_overlap_target_positions(d_target_positions, target_count, target_size, stream) - : target_count; -} + // use this utility to gather the string parts into a contiguous chars column + auto chars = make_strings_column(indices.begin(), indices.end(), stream, mr); + auto chars_data = chars->release().data; -/** - * @brief Filter target positions beyond the maximum target replacements per row limit. - * - * This performs an in-place modification of the target positions to remove any target positions - * corresponding to targets that should not be replaced due to the maximum target replacement per - * row limit. - * - * @param[in,out] target_positions Target positions to filter in-place. - * @param[in] target_count Number of target positions. - * @param[in] d_offsets_span Memory range encompassing the string column offsets. - * @param[in] max_repl_per_row Maximum target replacements per row limit. - * @param[in] stream CUDA stream to use for device operations. 
- * @return Number of target positions after filtering. - */ -size_type filter_maxrepl_target_positions(size_type* d_target_positions, - size_type target_count, - device_span d_offsets_span, - size_type max_repl_per_row, - rmm::cuda_stream_view stream) -{ - auto pos_to_row_fn = cuda::proclaim_return_type( - [d_offsets_span] __device__(size_type target_pos) -> size_type { - auto upper_bound = - thrust::upper_bound(thrust::seq, d_offsets_span.begin(), d_offsets_span.end(), target_pos); - return thrust::distance(d_offsets_span.begin(), upper_bound); - }); + // create offsets from the sizes + offsets = std::get<0>( + cudf::strings::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr)); - // compute the match count per row for each target position - rmm::device_uvector match_counts(target_count, stream); - auto d_match_counts = match_counts.data(); - thrust::inclusive_scan_by_key( - rmm::exec_policy(stream), - thrust::make_transform_iterator(d_target_positions, pos_to_row_fn), - thrust::make_transform_iterator(d_target_positions + target_count, pos_to_row_fn), - thrust::make_constant_iterator(1), - d_match_counts); - - // In-place remove any positions that exceed the per-row match limit - auto target_pos_end = - thrust::remove_if(rmm::exec_policy(stream), - d_target_positions, - d_target_positions + target_count, - d_match_counts, - [max_repl_per_row] __device__(size_type match_count) -> bool { - return match_count > max_repl_per_row; - }); - - return cudf::distance(d_target_positions, target_pos_end); + // build the strings columns from the chars and offsets + return make_strings_column(strings_count, + std::move(offsets), + std::move(chars_data.release()[0]), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } /** - * @brief Scalar string replacement using a character-level parallel algorithm. 
- * - * Replaces occurrences of the target string with the replacement string using an algorithm with - * character-level parallelism. This algorithm will perform well when the strings in the string - * column are relatively long. - * @see BYTES_PER_VALID_ROW_THRESHOLD + * @brief Function logic for the replace_string_parallel * - * @param strings String column to search for target strings. - * @param chars_start Offset of the first character in the string column. - * @param chars_end Offset beyond the last character in the string column to search. - * @param d_target String to search for within the string column. - * @param d_repl Replacement string if target string is found. - * @param maxrepl Maximum times to replace if target appears multiple times in a string. - * @param stream CUDA stream to use for device operations - * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings column. + * Performs the multi-replace operation with a thread per string. + * This performs best on smaller strings. 
@see AVG_CHAR_BYTES_THRESHOLD */ -std::unique_ptr replace_char_parallel(strings_column_view const& strings, - size_type chars_start, - size_type chars_end, - string_view const& d_target, - string_view const& d_repl, - int32_t maxrepl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const strings_count = strings.size(); - auto const offset_count = strings_count + 1; - auto const d_offsets = strings.offsets().begin() + strings.offset(); // TODO: PR 14824 - auto const d_in_chars = strings.chars_begin(stream); - auto const chars_bytes = chars_end - chars_start; - auto const target_size = d_target.size_bytes(); - - // detect a target match at the specified byte position - device_span const d_chars_span(d_in_chars, chars_end); - auto target_detector = [d_chars_span, d_target] __device__(size_type char_idx) { - auto target_size = d_target.size_bytes(); - auto target_ptr = d_chars_span.begin() + char_idx; - return target_ptr + target_size <= d_chars_span.end() && - d_target.compare(target_ptr, target_size) == 0; - }; - - // Count target string matches across all character positions, ignoring string boundaries and - // overlapping target strings. This may produce false-positives. 
- size_type target_count = thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(chars_start), - thrust::make_counting_iterator(chars_end), - target_detector); - if (target_count == 0) { - // nothing to replace, copy the input column - return std::make_unique(strings.parent(), stream, mr); - } +struct replace_fn { + column_device_view const d_strings; + string_view d_target; + string_view d_replacement; + cudf::size_type maxrepl; + cudf::size_type* d_offsets{}; + char* d_chars{}; - // create a vector of the potential target match positions - rmm::device_uvector target_positions(target_count, stream); - auto d_target_positions = target_positions.data(); - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(chars_start), - thrust::make_counting_iterator(chars_end), - d_target_positions, - target_detector); - - device_span d_offsets_span(d_offsets, offset_count); - if (target_size > 1) { - target_count = - filter_false_target_positions(target_positions, d_offsets_span, target_size, stream); - if (target_count == 0) { - // nothing to replace, copy the input column - return std::make_unique(strings.parent(), stream, mr); + __device__ void operator()(size_type idx) + { + if (d_strings.is_null(idx)) { + if (!d_chars) { d_offsets[idx] = 0; } + return; } - } + auto const d_str = d_strings.element(idx); + char const* in_ptr = d_str.data(); - // filter out any target positions that exceed the per-row match limit - if (maxrepl > 0 && target_count > maxrepl) { - target_count = filter_maxrepl_target_positions( - d_target_positions, target_count, d_offsets_span, maxrepl, stream); + size_type bytes = d_str.size_bytes(); + size_type spos = 0; + size_type lpos = 0; + char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + auto max_n = (maxrepl < 0) ? 
d_str.length() : maxrepl; + + // check each character against each target + while (spos < d_str.size_bytes() && (max_n > 0)) { + auto const d_tgt = d_target; + if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit + (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match + { + auto const d_repl = d_replacement; + bytes += d_repl.size_bytes() - d_tgt.size_bytes(); + if (out_ptr) { + out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos); + out_ptr = copy_string(out_ptr, d_repl); + lpos = spos + d_tgt.size_bytes(); + } + spos += d_tgt.size_bytes() - 1; + --max_n; + } + ++spos; + } + if (out_ptr) { // copy remainder + memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); + } else { + d_offsets[idx] = bytes; + } } +}; - // build the offsets column - auto offsets_column = make_numeric_column( - data_type{type_id::INT32}, offset_count, mask_state::UNALLOCATED, stream, mr); - auto offsets_view = offsets_column->mutable_view(); - auto delta_per_target = d_repl.size_bytes() - target_size; - device_span d_target_positions_span(d_target_positions, target_count); - auto offsets_update_fn = cuda::proclaim_return_type( - [d_target_positions_span, delta_per_target, chars_start] __device__(int32_t offset) -> int32_t { - // determine the number of target positions occurring before this offset - size_type const* next_target_pos_ptr = thrust::lower_bound( - thrust::seq, d_target_positions_span.begin(), d_target_positions_span.end(), offset); - size_type num_prev_targets = - thrust::distance(d_target_positions_span.data(), next_target_pos_ptr); - return offset - chars_start + delta_per_target * num_prev_targets; - }); - thrust::transform(rmm::exec_policy(stream), - d_offsets_span.begin(), - d_offsets_span.end(), - offsets_view.begin(), - offsets_update_fn); - - // build the characters column - rmm::device_uvector chars(chars_bytes + (delta_per_target * target_count), stream, mr); - auto d_out_chars = chars.data(); - thrust::for_each_n( - 
rmm::exec_policy(stream), - thrust::make_counting_iterator(chars_start), - chars_bytes, - target_replacer_fn{ - d_target_positions_span, d_in_chars, d_out_chars, target_size, d_repl, chars_start}); - - // free the target positions buffer as it is no longer needed - (void)target_positions.release(); - - return make_strings_column(strings_count, - std::move(offsets_column), - chars.release(), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); -} - -/** - * @brief Scalar string replacement using a row-level parallel algorithm. - * - * Replaces occurrences of the target string with the replacement string using an algorithm with - * row-level parallelism. This algorithm will perform well when the strings in the string - * column are relatively short. - * @see BYTES_PER_VALID_ROW_THRESHOLD - * - * @param strings String column to search for target strings. - * @param d_target String to search for within the string column. - * @param d_repl Replacement string if target string is found. - * @param maxrepl Maximum times to replace if target appears multiple times in a string. - * @param stream CUDA stream to use for device operations - * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings column. 
- */ -std::unique_ptr replace_row_parallel(strings_column_view const& strings, - string_view const& d_target, - string_view const& d_repl, - int32_t maxrepl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr replace_string_parallel(strings_column_view const& input, + string_view const& d_target, + string_view const& d_replacement, + cudf::size_type maxrepl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - auto d_strings = column_device_view::create(strings.parent(), stream); + auto d_strings = column_device_view::create(input.parent(), stream); - // this utility calls the given functor to build the offsets and chars columns auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( - replace_row_parallel_fn{*d_strings, d_target, d_repl, maxrepl}, strings.size(), stream, mr); + replace_fn{*d_strings, d_target, d_replacement, maxrepl}, input.size(), stream, mr); - return make_strings_column(strings.size(), + return make_strings_column(input.size(), std::move(offsets_column), chars.release(), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } } // namespace -std::unique_ptr replace(strings_column_view const& strings, +std::unique_ptr replace(strings_column_view const& input, string_scalar const& target, string_scalar const& repl, - int32_t maxrepl, + cudf::size_type maxrepl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(type_id::STRING); - if (maxrepl == 0) return std::make_unique(strings.parent(), stream, mr); + if (input.is_empty()) { return make_empty_column(type_id::STRING); } + if (maxrepl == 0) { return std::make_unique(input.parent(), stream, mr); } CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); CUDF_EXPECTS(target.is_valid(stream), "Parameter target must be valid."); 
CUDF_EXPECTS(target.size() > 0, "Parameter target must not be empty string."); @@ -558,25 +425,11 @@ std::unique_ptr replace(strings_column_view const& strings, string_view d_target(target.data(), target.size()); string_view d_repl(repl.data(), repl.size()); - // determine range of characters in the base column - auto const strings_count = strings.size(); - auto const offset_count = strings_count + 1; - auto const d_offsets = strings.offsets().data() + strings.offset(); - size_type const chars_start = - (strings.offset() == 0) - ? 0 - : cudf::detail::get_value(strings.offsets(), strings.offset(), stream); - size_type const chars_end = (offset_count == strings.offsets().size()) - ? strings.chars_size(stream) - : cudf::detail::get_value( - strings.offsets(), strings.offset() + strings_count, stream); - size_type const chars_bytes = chars_end - chars_start; - - auto const avg_bytes_per_row = chars_bytes / std::max(strings_count - strings.null_count(), 1); - return (avg_bytes_per_row < BYTES_PER_VALID_ROW_THRESHOLD) - ? replace_row_parallel(strings, d_target, d_repl, maxrepl, stream, mr) - : replace_char_parallel( - strings, chars_start, chars_end, d_target, d_repl, maxrepl, stream, mr); + return (input.size() == input.null_count() || + ((input.chars_size(stream) / (input.size() - input.null_count())) < + AVG_CHAR_BYTES_THRESHOLD)) + ? 
replace_string_parallel(input, d_target, d_repl, maxrepl, stream, mr) + : replace_character_parallel(input, d_target, d_repl, maxrepl, stream, mr); } } // namespace detail diff --git a/cpp/src/strings/replace/replace_nulls.cu b/cpp/src/strings/replace/replace_nulls.cu index 26fb1c7819f..bbca4997f57 100644 --- a/cpp/src/strings/replace/replace_nulls.cu +++ b/cpp/src/strings/replace/replace_nulls.cu @@ -36,18 +36,18 @@ namespace cudf { namespace strings { namespace detail { -std::unique_ptr replace_nulls(strings_column_view const& strings, +std::unique_ptr replace_nulls(strings_column_view const& input, string_scalar const& repl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); + size_type strings_count = input.size(); + if (strings_count == 0) { return make_empty_column(type_id::STRING); } CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); string_view d_repl(repl.data(), repl.size()); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; // build offsets column @@ -58,12 +58,12 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, })); auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_offsets = offsets_column->view().data(); + auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); // build chars column rmm::device_uvector chars(bytes, stream, mr); auto d_chars = chars.data(); - thrust::for_each_n(rmm::exec_policy(stream), + thrust::for_each_n(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), strings_count, [d_strings, d_repl, d_offsets, d_chars] __device__(size_type idx) { diff --git 
a/cpp/src/strings/replace/replace_slice.cu b/cpp/src/strings/replace/replace_slice.cu index 041801336e6..c11664c86d4 100644 --- a/cpp/src/strings/replace/replace_slice.cu +++ b/cpp/src/strings/replace/replace_slice.cu @@ -50,7 +50,7 @@ struct replace_slice_fn { __device__ void operator()(size_type idx) { if (d_strings.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) { d_offsets[idx] = 0; } return; } auto const d_str = d_strings.element(idx); @@ -75,34 +75,37 @@ struct replace_slice_fn { } // namespace -std::unique_ptr replace_slice(strings_column_view const& strings, +std::unique_ptr replace_slice(strings_column_view const& input, string_scalar const& repl, size_type start, size_type stop, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(type_id::STRING); + if (input.is_empty()) { return make_empty_column(type_id::STRING); } CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); - if (stop > 0) CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop."); + if (stop > 0) { + CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop."); + } string_view d_repl(repl.data(), repl.size()); - auto d_strings = column_device_view::create(strings.parent(), stream); + auto d_strings = column_device_view::create(input.parent(), stream); // this utility calls the given functor to build the offsets and chars columns auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( - replace_slice_fn{*d_strings, d_repl, start, stop}, strings.size(), stream, mr); + replace_slice_fn{*d_strings, d_repl, start, stop}, input.size(), stream, mr); - return make_strings_column(strings.size(), + return make_strings_column(input.size(), std::move(offsets_column), chars.release(), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), 
stream, mr)); } + } // namespace detail -std::unique_ptr replace_slice(strings_column_view const& strings, +std::unique_ptr replace_slice(strings_column_view const& input, string_scalar const& repl, size_type start, size_type stop, @@ -110,7 +113,7 @@ std::unique_ptr replace_slice(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_slice(strings, repl, start, stop, stream, mr); + return detail::replace_slice(input, repl, start, stop, stream, mr); } } // namespace strings From 5192b608eeed4bda9317c657253c3a5630aa4c5d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 3 Apr 2024 09:11:37 -1000 Subject: [PATCH 024/272] Align date_range defaults with pandas, support tz (#15139) Precursor to https://github.com/rapidsai/cudf/issues/15116 * Aligns `date_range` signature with pandas, _technically_ an API breakage with `closed` changing defaults even though it still isn't supported * Copies pandas behavior of allowing `date_range` with just two of `start/end/periods` * Supports `tz` arg now Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15139 --- python/cudf/cudf/core/tools/datetimes.py | 49 +++++++++++++----------- python/cudf/cudf/tests/test_datetime.py | 16 ++++++++ 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 65f97c99934..ed8fca88acd 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -799,9 +799,11 @@ def date_range( periods=None, freq=None, tz=None, - normalize=False, + normalize: bool = False, name=None, - closed=None, + closed: Literal["left", "right", "both", "neither"] = "both", + *, + unit: Optional[str] = None, ): """Return a fixed 
frequency DatetimeIndex. @@ -837,8 +839,13 @@ def date_range( name : str, default None Name of the resulting DatetimeIndex - closed : {None, 'left', 'right'}, optional - Not Supported + closed : {"left", "right", "both", "neither"}, default "both" + Whether to set each bound as closed or open. + Currently only "both" is supported + + unit : str, default None + Specify the desired resolution of the result. Currently + not supported. Returns ------- @@ -875,11 +882,15 @@ def date_range( '2026-04-23 08:00:00'], dtype='datetime64[ns]') """ - if tz is not None: - raise NotImplementedError("tz is currently unsupported.") + if closed != "both": + raise NotImplementedError(f"{closed=} is currently unsupported.") + if unit is not None: + raise NotImplementedError(f"{unit=} is currently unsupported.") + if normalize is not False: + raise NotImplementedError(f"{normalize=} is currently unsupported.") - if closed is not None: - raise NotImplementedError("closed is currently unsupported.") + if freq is None and any(arg is None for arg in (start, end, periods)): + freq = "D" if (start, end, periods, freq).count(None) > 1: raise ValueError( @@ -894,7 +905,7 @@ def date_range( FutureWarning, ) - dtype = np.dtype(" bool: @@ -1026,14 +1039,6 @@ def _has_non_fixed_frequency(freq: DateOffset) -> bool: return len(freq.kwds.keys() & non_fixed_frequencies) > 0 -def _has_mixed_freqeuency(freq: DateOffset) -> bool: - """Utility to determine if `freq` contains mixed fixed and non-fixed - frequency offset. e.g. 
{months=1, days=5} - """ - - return _has_fixed_frequency(freq) and _has_non_fixed_frequency(freq) - - def _offset_to_nanoseconds_lower_bound(offset: DateOffset) -> int: """Given a DateOffset, which can consist of either fixed frequency or non-fixed frequency offset, convert to the smallest possible fixed diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 7c209078fd2..37ba7acf044 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2357,3 +2357,19 @@ def test_timezone_array_notimplemented(): def test_to_datetime_errors_ignore_deprecated(): with pytest.warns(FutureWarning): cudf.to_datetime("2001-01-01 00:04:45", errors="ignore") + + +def test_date_range_freq_default(): + result = pd.date_range("2020-01-01", periods=2, name="foo") + expected = cudf.date_range("2020-01-01", periods=2, name="foo") + assert_eq(result, expected) + + +def test_date_range_tz(): + result = pd.date_range("2020-01-01", periods=2, tz="UTC") + expected = cudf.date_range("2020-01-01", periods=2, tz="UTC") + assert_eq(result, expected) + + result = pd.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") + expected = cudf.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") + assert_eq(result, expected) From fbaad8a480d3b2755afe04431c5abe6c098224b4 Mon Sep 17 00:00:00 2001 From: Tanmay Gujar Date: Wed, 3 Apr 2024 18:10:19 -0400 Subject: [PATCH 025/272] [FEA] Performance improvement for mixed left semi/anti join (#15288) Current implementation of mixed semi/anti join probes the built hash table twice -- once to find the output table size and once to build the output. Since the upper bound on output table size is O(N) where N is the size of the left table, we can avoid probing twice and achieve a faster join implementation. This implementation reserves the required upper memory bound, builds the output, and then collects the relevant output rows. This probes the hash table only once. 
This PR also removes the size kernels for mixed semi join and output size parameters passed to the mixed semi join. Closes #15250 # Benchmark Results from cudf repository ## mixed_left_semi_join_32bit (New implementation) ### [0] NVIDIA TITAN V ``` | Key Type | Payload Type | Nullable | Build Table Size | Probe Table Size | Samples | CPU Time | Noise | GPU Time | Noise | |----------|--------------|----------|------------------|------------------|---------|------------|-------|------------|-------| | I32 | I32 | 0 | 100000 | 100000 | 1920x | 266.239 us | 3.43% | 261.324 us | 2.84% | | I32 | I32 | 0 | 100000 | 400000 | 1024x | 495.434 us | 1.18% | 490.544 us | 0.63% | | I32 | I32 | 0 | 10000000 | 10000000 | 24x | 20.919 ms | 0.04% | 20.914 ms | 0.03% | | I32 | I32 | 0 | 10000000 | 40000000 | 11x | 54.697 ms | 0.03% | 54.692 ms | 0.03% | | I32 | I32 | 0 | 10000000 | 100000000 | 11x | 122.171 ms | 0.03% | 122.166 ms | 0.03% | | I32 | I32 | 0 | 80000000 | 100000000 | 11x | 192.979 ms | 0.01% | 192.975 ms | 0.01% | | I32 | I32 | 0 | 100000000 | 100000000 | 11x | 212.878 ms | 0.01% | 212.874 ms | 0.01% | | I32 | I32 | 0 | 10000000 | 240000000 | 11x | 279.794 ms | 0.01% | 279.790 ms | 0.01% | | I32 | I32 | 0 | 80000000 | 240000000 | 11x | 351.186 ms | 0.01% | 351.183 ms | 0.01% | | I32 | I32 | 0 | 100000000 | 240000000 | 11x | 370.794 ms | 0.01% | 370.790 ms | 0.01% | ``` ## mixed_left_semi_join_32bit (Old implementation) ### [0] NVIDIA TITAN V ``` | Key Type | Payload Type | Nullable | Build Table Size | Probe Table Size | Samples | CPU Time | Noise | GPU Time | Noise | |----------|--------------|----------|------------------|------------------|---------|------------|-------|------------|-------| | I32 | I32 | 0 | 100000 | 100000 | 1392x | 368.030 us | 3.05% | 363.065 us | 2.70% | | I32 | I32 | 0 | 100000 | 400000 | 832x | 832.492 us | 0.84% | 827.586 us | 0.60% | | I32 | I32 | 0 | 10000000 | 10000000 | 16x | 32.310 ms | 0.03% | 32.305 ms | 0.03% | | I32 | I32 | 0 | 
10000000 | 40000000 | 11x | 100.222 ms | 0.03% | 100.218 ms | 0.03% | | I32 | I32 | 0 | 10000000 | 100000000 | 11x | 235.874 ms | 0.01% | 235.870 ms | 0.01% | | I32 | I32 | 0 | 80000000 | 100000000 | 11x | 307.042 ms | 0.01% | 307.038 ms | 0.01% | | I32 | I32 | 0 | 100000000 | 100000000 | 11x | 326.797 ms | 0.01% | 326.794 ms | 0.01% | | I32 | I32 | 0 | 10000000 | 240000000 | 11x | 552.730 ms | 0.01% | 552.728 ms | 0.01% | | I32 | I32 | 0 | 80000000 | 240000000 | 11x | 624.958 ms | 0.01% | 624.956 ms | 0.01% | | I32 | I32 | 0 | 100000000 | 240000000 | 11x | 644.148 ms | 0.00% | 644.146 ms | 0.00% | ``` Authors: - Tanmay Gujar (https://github.com/tgujar) - Yunsong Wang (https://github.com/PointKernel) Approvers: - Jason Lowe (https://github.com/jlowe) - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15288 --- cpp/CMakeLists.txt | 1 - cpp/include/cudf/join.hpp | 90 +---- cpp/src/join/mixed_join_kernels_semi.cu | 31 +- cpp/src/join/mixed_join_kernels_semi.cuh | 64 +--- cpp/src/join/mixed_join_semi.cu | 360 ++---------------- cpp/src/join/mixed_join_size_kernels_semi.cu | 125 ------ cpp/tests/join/mixed_join_tests.cu | 41 -- java/src/main/java/ai/rapids/cudf/Table.java | 146 ------- java/src/main/native/src/TableJni.cpp | 60 --- .../test/java/ai/rapids/cudf/TableTest.java | 116 ------ 10 files changed, 42 insertions(+), 992 deletions(-) delete mode 100644 cpp/src/join/mixed_join_size_kernels_semi.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f1d43e3c35f..7c32474ea56 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -453,7 +453,6 @@ add_library( src/join/mixed_join_semi.cu src/join/mixed_join_size_kernel.cu src/join/mixed_join_size_kernel_nulls.cu - src/join/mixed_join_size_kernels_semi.cu src/join/semi_join.cu src/json/json_path.cu src/lists/contains.cu 
diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index b7a3129cfec..e343ad9ee32 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -944,9 +944,6 @@ mixed_full_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not - * @param output_size_data An optional pair of values indicating the exact output size and the - * number of matches for each row in the larger of the two input tables, left or right (may be - * precomputed using the corresponding mixed_full_join_size API). * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -958,8 +955,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& left_conditional, table_view const& right_conditional, ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - std::optional>> output_size_data = {}, + null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -996,9 +992,6 @@ std::unique_ptr> mixed_left_semi_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not - * @param output_size_data An optional pair of values indicating the exact output size and the - * number of matches for each row in the larger of the two input tables, left or right (may be - * precomputed using the corresponding mixed_full_join_size API). 
* @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -1010,8 +1003,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& left_conditional, table_view const& right_conditional, ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - std::optional>> output_size_data = {}, + null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -1094,84 +1086,6 @@ std::pair>> mixed_le null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Returns the exact number of matches (rows) when performing a mixed - * left semi join between the specified tables where the columns of the - * equality table are equal and the predicate evaluates to true on the - * conditional tables. - * - * If the provided predicate returns NULL for a pair of rows (left, right), - * that pair is not included in the output. It is the user's responsibility to - * choose a suitable compare_nulls value AND use appropriate null-safe - * operators in the expression. - * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. - * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not - * match. - * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not - * match. 
- * - * @param left_equality The left table used for the equality join - * @param right_equality The right table used for the equality join - * @param left_conditional The left table used for the conditional join - * @param right_conditional The right table used for the conditional join - * @param binary_predicate The condition on which to join - * @param compare_nulls Whether or not null values join to each other or not - * @param mr Device memory resource used to allocate the returned table and columns' device memory - * - * @return A pair containing the size that would result from performing the - * requested join and the number of matches for each row in one of the two - * tables. Which of the two tables is an implementation detail and should not - * be relied upon, simply passed to the corresponding `mixed_left_join` API as - * is. - */ -std::pair>> mixed_left_semi_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns the exact number of matches (rows) when performing a mixed - * left anti join between the specified tables. - * - * If the provided predicate returns NULL for a pair of rows (left, right), - * that pair is not included in the output. It is the user's responsibility to - * choose a suitable compare_nulls value AND use appropriate null-safe - * operators in the expression. - * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. - * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not - * match. - * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not - * match. 
- * - * @param left_equality The left table used for the equality join - * @param right_equality The right table used for the equality join - * @param left_conditional The left table used for the conditional join - * @param right_conditional The right table used for the conditional join - * @param binary_predicate The condition on which to join - * @param compare_nulls Whether or not null values join to each other or not - * @param mr Device memory resource used to allocate the returned table and columns' device memory - * - * @return A pair containing the size that would result from performing the - * requested join and the number of matches for each row in one of the two - * tables. Which of the two tables is an implementation detail and should not - * be relied upon, simply passed to the corresponding `mixed_left_join` API as - * is. - */ -std::pair>> mixed_left_anti_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Returns the exact number of matches (rows) when performing a * conditional inner join between the specified tables where the predicate diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 5a543997a50..01e3fe09b38 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -41,12 +41,9 @@ __attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ table_device_view build, row_hash const hash_probe, row_equality const equality_probe, - join_kind const join_type, cudf::detail::semi_map_type::device_view hash_table_view, - size_type* join_output_l, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* 
join_result_offsets, - bool const swap_tables) + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data) { // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is @@ -60,7 +57,7 @@ __attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ cudf::size_type const left_num_rows = left_table.num_rows(); cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); + auto const outer_num_rows = left_num_rows; cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; @@ -70,12 +67,10 @@ __attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ if (outer_row_index < outer_num_rows) { // Figure out the number of elements for this key. auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, swap_tables, equality_probe}; + evaluator, thread_intermediate_storage, false, equality_probe}; - if ((join_type == join_kind::LEFT_ANTI_JOIN) != - (hash_table_view.contains(outer_row_index, hash_probe, equality))) { - *(join_output_l + join_result_offsets[outer_row_index]) = outer_row_index; - } + left_table_keep_mask[outer_row_index] = + hash_table_view.contains(outer_row_index, hash_probe, equality); } } @@ -86,12 +81,9 @@ template __global__ void mixed_join_semi( table_device_view build, row_hash const hash_probe, row_equality const equality_probe, - join_kind const join_type, cudf::detail::semi_map_type::device_view hash_table_view, - size_type* join_output_l, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data); template __global__ void mixed_join_semi( table_device_view 
left_table, @@ -100,12 +92,9 @@ template __global__ void mixed_join_semi( table_device_view build, row_hash const hash_probe, row_equality const equality_probe, - join_kind const join_type, cudf::detail::semi_map_type::device_view hash_table_view, - size_type* join_output_l, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data); } // namespace detail diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index f411d36f0a8..4ea404d451c 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -27,53 +27,7 @@ namespace cudf { namespace detail { /** - * @brief Computes the output size of joining the left table to the right table for semi/anti joins. - * - * This method probes the hash table with each row in the probe table using a - * custom equality comparator that also checks that the conditional expression - * evaluates to true between the left/right tables when a match is found - * between probe and build rows. - * - * @tparam block_size The number of threads per block for this kernel - * @tparam has_nulls Whether or not the inputs may contain nulls. - * - * @param[in] left_table The left table - * @param[in] right_table The right table - * @param[in] probe The table with which to probe the hash table for matches. - * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. - * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] join_type The type of join to be performed - * @param[in] hash_table_view The hash table built from `build`. - * @param[in] device_expression_data Container of device data required to evaluate the desired - * expression. 
- * @param[in] swap_tables If true, the kernel was launched with one thread per right row and - * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. - * @param[out] output_size The resulting output size - * @param[out] matches_per_row The number of matches in one pair of - * equality/conditional tables for each row in the other pair of tables. If - * swap_tables is true, matches_per_row corresponds to the right_table, - * otherwise it corresponds to the left_table. Note that corresponding swap of - * left/right tables to determine which is the build table and which is the - * probe table has already happened on the host. - */ -template -__global__ void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - -/** - * @brief Performs a semi/anti join using the combination of a hash lookup to + * @brief Performs a semi join using the combination of a hash lookup to * identify equal rows between one pair of tables and the evaluation of an * expression containing an arbitrary expression. * @@ -91,16 +45,11 @@ __global__ void compute_mixed_join_output_size_semi( * @param[in] build The table with which the hash table was built. * @param[in] hash_probe The hasher used for the probe table. * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] join_type The type of join to be performed * @param[in] hash_table_view The hash table built from `build`. 
- * @param[out] join_output_l The left result of the join operation + * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating + * the corresponding index from left table is present in output * @param[in] device_expression_data Container of device data required to evaluate the desired * expression. - * @param[in] join_result_offsets The starting indices in join_output[l|r] - * where the matches for each row begin. Equivalent to a prefix sum of - * matches_per_row. - * @param[in] swap_tables If true, the kernel was launched with one thread per right row and - * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. */ template __global__ void mixed_join_semi(table_device_view left_table, @@ -109,12 +58,9 @@ __global__ void mixed_join_semi(table_device_view left_table, table_device_view build, row_hash const hash_probe, row_equality const equality_probe, - join_kind const join_type, cudf::detail::semi_map_type::device_view hash_table_view, - size_type* join_output_l, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data); } // namespace detail diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index edf6c32eadf..d654f580cad 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -92,7 +92,6 @@ std::unique_ptr> mixed_join_semi( ast::expression const& binary_predicate, null_equality compare_nulls, join_kind join_type, - std::optional>> output_size_data, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -107,12 +106,7 @@ std::unique_ptr> mixed_join_semi( auto const right_num_rows{right_conditional.num_rows()}; auto const left_num_rows{left_conditional.num_rows()}; - auto const swap_tables = (join_type == join_kind::INNER_JOIN) 
&& (right_num_rows > left_num_rows); - - // The "outer" table is the larger of the two tables. The kernels are - // launched with one thread per row of the outer table, which also means that - // it is the probe table for the hash - auto const outer_num_rows{swap_tables ? right_num_rows : left_num_rows}; + auto const outer_num_rows{left_num_rows}; // We can immediately filter out cases where the right table is empty. In // some cases, we return all the rows of the left table with a corresponding @@ -155,8 +149,8 @@ std::unique_ptr> mixed_join_semi( // TODO: The non-conditional join impls start with a dictionary matching, // figure out what that is and what it's needed for (and if conditional joins // need to do the same). - auto& probe = swap_tables ? right_equality : left_equality; - auto& build = swap_tables ? left_equality : right_equality; + auto& probe = left_equality; + auto& build = right_equality; auto probe_view = table_device_view::create(probe, stream); auto build_view = table_device_view::create(build, stream); auto left_conditional_view = table_device_view::create(left_conditional, stream); @@ -197,8 +191,7 @@ std::unique_ptr> mixed_join_semi( auto const equality_build_equality = row_comparator_build.equal_to(build_nulls, compare_nulls); auto const preprocessed_build_condtional = - experimental::row::equality::preprocessed_table::create( - swap_tables ? left_conditional : right_conditional, stream); + experimental::row::equality::preprocessed_table::create(right_conditional, stream); auto const row_comparator_conditional_build = cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, preprocessed_build_condtional}; @@ -225,84 +218,14 @@ std::unique_ptr> mixed_join_semi( auto hash_table_view = hash_table.get_device_view(); - // For inner joins we support optimizing the join by launching one thread for - // whichever table is larger rather than always using the left table. 
detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; - join_kind const kernel_join_type = - join_type == join_kind::FULL_JOIN ? join_kind::LEFT_JOIN : join_type; - - // If the join size data was not provided as an input, compute it here. - std::size_t join_size; - // Using an optional because we only need to allocate a new vector if one was - // not passed as input, and rmm::device_uvector is not default constructible - std::optional> matches_per_row{}; - device_span matches_per_row_span{}; auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); - if (output_size_data.has_value()) { - join_size = output_size_data->first; - matches_per_row_span = output_size_data->second; - } else { - // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); - - matches_per_row = - rmm::device_uvector{static_cast(outer_num_rows), stream, mr}; - // Note that the view goes out of scope after this else statement, but the - // data owned by matches_per_row stays alive so the data pointer is valid. 
- auto mutable_matches_per_row_span = cudf::device_span{ - matches_per_row->begin(), static_cast(outer_num_rows)}; - matches_per_row_span = cudf::device_span{ - matches_per_row->begin(), static_cast(outer_num_rows)}; - if (has_nulls) { - compute_mixed_join_output_size_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - kernel_join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - mutable_matches_per_row_span); - } else { - compute_mixed_join_output_size_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - kernel_join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - mutable_matches_per_row_span); - } - join_size = size.value(stream); - } - - if (join_size == 0) { return std::make_unique>(0, stream, mr); } - - // Given the number of matches per row, we need to compute the offsets for insertion. 
- auto join_result_offsets = - rmm::device_uvector{static_cast(outer_num_rows), stream, mr}; - thrust::exclusive_scan(rmm::exec_policy{stream}, - matches_per_row_span.begin(), - matches_per_row_span.end(), - join_result_offsets.begin()); - - auto left_indices = std::make_unique>(join_size, stream, mr); - auto const& join_output_l = left_indices->data(); + // Vector used to indicate indices from left/probe table which are present in output + auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); if (has_nulls) { mixed_join_semi @@ -313,12 +236,9 @@ std::unique_ptr> mixed_join_semi( *build_view, hash_probe, equality_probe, - kernel_join_type, hash_table_view, - join_output_l, - parser.device_expression_data, - join_result_offsets.data(), - swap_tables); + cudf::device_span(left_table_keep_mask), + parser.device_expression_data); } else { mixed_join_semi <<>>( @@ -328,235 +248,30 @@ std::unique_ptr> mixed_join_semi( *build_view, hash_probe, equality_probe, - kernel_join_type, hash_table_view, - join_output_l, - parser.device_expression_data, - join_result_offsets.data(), - swap_tables); + cudf::device_span(left_table_keep_mask), + parser.device_expression_data); } - return left_indices; -} - -std::pair>> -compute_mixed_join_output_size_semi(table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - join_kind join_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS( - (join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && - (join_type != join_kind::FULL_JOIN), - "Inner, left, and full join size estimation should use compute_mixed_join_output_size."); - - CUDF_EXPECTS(left_conditional.num_rows() == left_equality.num_rows(), - "The left conditional and equality tables must have the same number of rows."); - 
CUDF_EXPECTS(right_conditional.num_rows() == right_equality.num_rows(), - "The right conditional and equality tables must have the same number of rows."); - - auto const right_num_rows{right_conditional.num_rows()}; - auto const left_num_rows{left_conditional.num_rows()}; - auto const swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows); - - // The "outer" table is the larger of the two tables. The kernels are - // launched with one thread per row of the outer table, which also means that - // it is the probe table for the hash - auto const outer_num_rows{swap_tables ? right_num_rows : left_num_rows}; - - auto matches_per_row = std::make_unique>( - static_cast(outer_num_rows), stream, mr); - auto matches_per_row_span = cudf::device_span{ - matches_per_row->begin(), static_cast(outer_num_rows)}; - - // We can immediately filter out cases where one table is empty. In - // some cases, we return all the rows of the other table with a corresponding - // null index for the empty table; in others, we return an empty output. - if (right_num_rows == 0) { - switch (join_type) { - // Left, left anti, and full all return all the row indices from left - // with a corresponding NULL from the right. - case join_kind::LEFT_ANTI_JOIN: { - thrust::fill(matches_per_row->begin(), matches_per_row->end(), 1); - return {left_num_rows, std::move(matches_per_row)}; - } - // Inner and left semi joins return empty output because no matches can exist. - case join_kind::LEFT_SEMI_JOIN: return {0, std::move(matches_per_row)}; - default: CUDF_FAIL("Invalid join kind."); break; - } - } else if (left_num_rows == 0) { - switch (join_type) { - // Left, left anti, left semi, and inner joins all return empty sets. 
- case join_kind::LEFT_ANTI_JOIN: - case join_kind::LEFT_SEMI_JOIN: { - thrust::fill(matches_per_row->begin(), matches_per_row->end(), 0); - return {0, std::move(matches_per_row)}; - } - default: CUDF_FAIL("Invalid join kind."); break; - } - } - - // If evaluating the expression may produce null outputs we create a nullable - // output column and follow the null-supporting expression evaluation code - // path. - auto const has_nulls = cudf::nullate::DYNAMIC{ - cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || - binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)}; - - auto const parser = ast::detail::expression_parser{ - binary_predicate, left_conditional, right_conditional, has_nulls, stream, mr}; - CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, - "The expression must produce a boolean output."); - - // TODO: The non-conditional join impls start with a dictionary matching, - // figure out what that is and what it's needed for (and if conditional joins - // need to do the same). - auto& probe = swap_tables ? right_equality : left_equality; - auto& build = swap_tables ? 
left_equality : right_equality; - auto probe_view = table_device_view::create(probe, stream); - auto build_view = table_device_view::create(build, stream); - auto left_conditional_view = table_device_view::create(left_conditional, stream); - auto right_conditional_view = table_device_view::create(right_conditional, stream); - - auto const preprocessed_build = - experimental::row::equality::preprocessed_table::create(build, stream); - auto const preprocessed_probe = - experimental::row::equality::preprocessed_table::create(probe, stream); - auto const row_comparator = - cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; - auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); - - semi_map_type hash_table{compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; - - // Create hash table containing all keys found in right table - // TODO: To add support for nested columns we will need to flatten in many - // places. However, this probably isn't worth adding any time soon since we - // won't be able to support AST conditions for those types anyway. - auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; - auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build}; - auto const hash_build = row_hash_build.device_hasher(build_nulls); - // Since we may see multiple rows that are identical in the equality tables - // but differ in the conditional tables, the equality comparator used for - // insertion must account for both sets of tables. An alternative solution - // would be to use a multimap, but that solution would store duplicates where - // equality and conditional rows are equal, so this approach is preferable. 
- // One way to make this solution even more efficient would be to only include - // the columns of the conditional table that are used by the expression, but - // that requires additional plumbing through the AST machinery and is out of - // scope for now. - auto const row_comparator_build = - cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_build}; - auto const equality_build_equality = - row_comparator_build.equal_to(build_nulls, compare_nulls); - auto const preprocessed_build_condtional = - experimental::row::equality::preprocessed_table::create( - swap_tables ? left_conditional : right_conditional, stream); - auto const row_comparator_conditional_build = - cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, - preprocessed_build_condtional}; - auto const equality_build_conditional = - row_comparator_conditional_build.equal_to(build_nulls, compare_nulls); - double_row_equality equality_build{equality_build_equality, equality_build_conditional}; - make_pair_function_semi pair_func_build{}; - - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); - - // skip rows that are null here. - if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { - hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); - } else { - thrust::counting_iterator stencil(0); - auto const [row_bitmask, _] = - cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()); - row_is_valid pred{static_cast(row_bitmask.data())}; - - // insert valid rows - hash_table.insert_if( - iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); - } - - auto hash_table_view = hash_table.get_device_view(); - - // For inner joins we support optimizing the join by launching one thread for - // whichever table is larger rather than always using the left table. 
- detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); - auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; - - // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); - - auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; - auto const hash_probe = row_hash.device_hasher(has_nulls); - - // Determine number of output rows without actually building the output to simply - // find what the size of the output will be. - if (has_nulls) { - compute_mixed_join_output_size_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - matches_per_row_span); - } else { - compute_mixed_join_output_size_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - matches_per_row_span); - } - - return {size.value(stream), std::move(matches_per_row)}; + auto gather_map = std::make_unique>(probe.num_rows(), stream, mr); + + // gather_map_end will be the end of valid data in gather_map + auto gather_map_end = + thrust::copy_if(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(probe.num_rows()), + left_table_keep_mask.begin(), + gather_map->begin(), + [join_type] __device__(bool keep_row) { + return keep_row == (join_type == detail::join_kind::LEFT_SEMI_JOIN); + }); + + gather_map->resize(thrust::distance(gather_map->begin(), gather_map_end), stream); + return gather_map; } } // namespace detail -std::pair>> mixed_left_semi_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& 
right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::compute_mixed_join_output_size_semi(left_equality, - right_equality, - left_conditional, - right_conditional, - binary_predicate, - compare_nulls, - detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), - mr); -} - std::unique_ptr> mixed_left_semi_join( table_view const& left_equality, table_view const& right_equality, @@ -564,7 +279,6 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, - std::optional>> output_size_data, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -575,32 +289,10 @@ std::unique_ptr> mixed_left_semi_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, - output_size_data, cudf::get_default_stream(), mr); } -std::pair>> mixed_left_anti_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::compute_mixed_join_output_size_semi(left_equality, - right_equality, - left_conditional, - right_conditional, - binary_predicate, - compare_nulls, - detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), - mr); -} - std::unique_ptr> mixed_left_anti_join( table_view const& left_equality, table_view const& right_equality, @@ -608,7 +300,6 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, - std::optional>> output_size_data, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -619,7 +310,6 @@ std::unique_ptr> mixed_left_anti_join( binary_predicate, compare_nulls, 
detail::join_kind::LEFT_ANTI_JOIN, - output_size_data, cudf::get_default_stream(), mr); } diff --git a/cpp/src/join/mixed_join_size_kernels_semi.cu b/cpp/src/join/mixed_join_size_kernels_semi.cu deleted file mode 100644 index 7a22ac60710..00000000000 --- a/cpp/src/join/mixed_join_size_kernels_semi.cu +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "join/join_common_utils.cuh" -#include "join/join_common_utils.hpp" -#include "join/mixed_join_common_utils.cuh" - -#include -#include -#include -#include -#include - -#include - -namespace cudf { -namespace detail { - -namespace cg = cooperative_groups; - -#pragma GCC diagnostic ignored "-Wattributes" - -template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ - void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) -{ - // The (required) extern storage of the shared memory array leads to - // conflicting declarations between different templates. 
The easiest - // workaround is to declare an arbitrary (here char) array type then cast it - // after the fact to the appropriate type. - extern __shared__ char raw_intermediate_storage[]; - cudf::ast::detail::IntermediateDataType* intermediate_storage = - reinterpret_cast*>(raw_intermediate_storage); - auto thread_intermediate_storage = - intermediate_storage + (threadIdx.x * device_expression_data.num_intermediates); - - std::size_t thread_counter{0}; - cudf::size_type const start_idx = threadIdx.x + blockIdx.x * block_size; - cudf::size_type const stride = block_size * gridDim.x; - cudf::size_type const left_num_rows = left_table.num_rows(); - cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); - - auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, device_expression_data); - - // TODO: Address asymmetry in operator. - auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, swap_tables, equality_probe}; - - for (cudf::size_type outer_row_index = start_idx; outer_row_index < outer_num_rows; - outer_row_index += stride) { - matches_per_row[outer_row_index] = - ((join_type == join_kind::LEFT_ANTI_JOIN) != - (hash_table_view.contains(outer_row_index, hash_probe, equality))); - thread_counter += matches_per_row[outer_row_index]; - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); - - // Add block counter to global counter - if (threadIdx.x == 0) { - cuda::atomic_ref ref{*output_size}; - ref.fetch_add(block_counter, cuda::std::memory_order_relaxed); - } -} - -template __global__ void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const 
equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - -template __global__ void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - -} // namespace detail - -} // namespace cudf diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index cc37dadffd8..6c147c8a128 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -657,10 +657,6 @@ struct MixedJoinSingleReturnTest : public MixedJoinTest { std::vector expected_outputs, cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) { - auto [result_size, actual_counts] = this->join_size( - left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); - EXPECT_TRUE(result_size == expected_outputs.size()); - auto result = this->join( left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); std::vector resulting_indices; @@ -751,19 +747,6 @@ struct MixedJoinSingleReturnTest : public MixedJoinTest { cudf::table_view right_conditional, cudf::ast::operation predicate, cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) = 0; - - /** - * This method must be implemented by subclasses for specific types of joins. - * It should be a simply forwarding of arguments to the appropriate cudf - * mixed join size computation API. 
- */ - virtual std::pair>> join_size( - cudf::table_view left_equality, - cudf::table_view right_equality, - cudf::table_view left_conditional, - cudf::table_view right_conditional, - cudf::ast::operation predicate, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) = 0; }; /** @@ -781,18 +764,6 @@ struct MixedLeftSemiJoinTest : public MixedJoinSingleReturnTest { return cudf::mixed_left_semi_join( left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); } - - std::pair>> join_size( - cudf::table_view left_equality, - cudf::table_view right_equality, - cudf::table_view left_conditional, - cudf::table_view right_conditional, - cudf::ast::operation predicate, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override - { - return cudf::mixed_left_semi_join_size( - left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); - } }; TYPED_TEST_SUITE(MixedLeftSemiJoinTest, cudf::test::IntegralTypesNotBool); @@ -874,18 +845,6 @@ struct MixedLeftAntiJoinTest : public MixedJoinSingleReturnTest { return cudf::mixed_left_anti_join( left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); } - - std::pair>> join_size( - cudf::table_view left_equality, - cudf::table_view right_equality, - cudf::table_view left_conditional, - cudf::table_view right_conditional, - cudf::ast::operation predicate, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override - { - return cudf::mixed_left_anti_join_size( - left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); - } }; TYPED_TEST_SUITE(MixedLeftAntiJoinTest, cudf::test::IntegralTypesNotBool); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 5ce2f9d2d6e..4038b3a40b8 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -732,32 
+732,14 @@ private static native long[] mixedFullJoinGatherMaps(long leftKeysTable, long ri long leftConditionTable, long rightConditionTable, long condition, boolean compareNullsEqual); - private static native long[] mixedLeftSemiJoinSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual); - private static native long[] mixedLeftSemiJoinGatherMap(long leftKeysTable, long rightKeysTable, long leftConditionTable, long rightConditionTable, long condition, boolean compareNullsEqual); - private static native long[] mixedLeftSemiJoinGatherMapWithSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual, - long outputRowCount, long matchesColumnView); - - private static native long[] mixedLeftAntiJoinSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual); - private static native long[] mixedLeftAntiJoinGatherMap(long leftKeysTable, long rightKeysTable, long leftConditionTable, long rightConditionTable, long condition, boolean compareNullsEqual); - private static native long[] mixedLeftAntiJoinGatherMapWithSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual, - long outputRowCount, long matchesColumnView); - private static native long[] crossJoin(long leftTable, long rightTable) throws CudfException; private static native long[] concatenate(long[] cudfTablePointers) throws CudfException; @@ -3747,34 +3729,6 @@ public GatherMap conditionalLeftSemiJoinGatherMap(Table rightTable, return buildSingleJoinGatherMap(gatherMapData); } - /** - * Computes output size information for a left semi join between two tables using a mix of - * equality and inequality conditions. 
The entire join condition is assumed to be a logical AND - * of the equality condition and inequality condition. - * NOTE: It is the responsibility of the caller to close the resulting size information object - * or native resources can be leaked! - * @param leftKeys the left table's key columns for the equality condition - * @param rightKeys the right table's key columns for the equality condition - * @param leftConditional the left table's columns needed to evaluate the inequality condition - * @param rightConditional the right table's columns needed to evaluate the inequality condition - * @param condition the inequality condition of the join - * @param nullEquality whether nulls should compare as equal - * @return size information for the join - */ - public static MixedJoinSize mixedLeftSemiJoinSize(Table leftKeys, Table rightKeys, - Table leftConditional, Table rightConditional, - CompiledExpression condition, - NullEquality nullEquality) { - long[] mixedSizeInfo = mixedLeftSemiJoinSize( - leftKeys.getNativeView(), rightKeys.getNativeView(), - leftConditional.getNativeView(), rightConditional.getNativeView(), - condition.getNativeHandle(), nullEquality == NullEquality.EQUAL); - assert mixedSizeInfo.length == 2; - long outputRowCount = mixedSizeInfo[0]; - long matchesColumnHandle = mixedSizeInfo[1]; - return new MixedJoinSize(outputRowCount, new ColumnVector(matchesColumnHandle)); - } - /** * Computes the gather map that can be used to manifest the result of a left semi join between * two tables using a mix of equality and inequality conditions. The entire join condition is @@ -3804,42 +3758,6 @@ public static GatherMap mixedLeftSemiJoinGatherMap(Table leftKeys, Table rightKe return buildSingleJoinGatherMap(gatherMapData); } - /** - * Computes the gather map that can be used to manifest the result of a left semi join between - * two tables using a mix of equality and inequality conditions. 
The entire join condition is - * assumed to be a logical AND of the equality condition and inequality condition. - * A {@link GatherMap} instance will be returned that can be used to gather - * the left table to produce the result of the left semi join. - * - * It is the responsibility of the caller to close the resulting gather map instances. - * - * This interface allows passing the size result from - * {@link #mixedLeftSemiJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)} - * when the output size was computed previously. - * - * @param leftKeys the left table's key columns for the equality condition - * @param rightKeys the right table's key columns for the equality condition - * @param leftConditional the left table's columns needed to evaluate the inequality condition - * @param rightConditional the right table's columns needed to evaluate the inequality condition - * @param condition the inequality condition of the join - * @param nullEquality whether nulls should compare as equal - * @param joinSize mixed join size result - * @return left and right table gather maps - */ - public static GatherMap mixedLeftSemiJoinGatherMap(Table leftKeys, Table rightKeys, - Table leftConditional, Table rightConditional, - CompiledExpression condition, - NullEquality nullEquality, - MixedJoinSize joinSize) { - long[] gatherMapData = mixedLeftSemiJoinGatherMapWithSize( - leftKeys.getNativeView(), rightKeys.getNativeView(), - leftConditional.getNativeView(), rightConditional.getNativeView(), - condition.getNativeHandle(), - nullEquality == NullEquality.EQUAL, - joinSize.getOutputRowCount(), joinSize.getMatches().getNativeView()); - return buildSingleJoinGatherMap(gatherMapData); - } - /** * Computes the gather map that can be used to manifest the result of a left anti-join between * two tables. 
It is assumed this table instance holds the key columns from the left table, and @@ -3919,34 +3837,6 @@ public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable, return buildSingleJoinGatherMap(gatherMapData); } - /** - * Computes output size information for a left anti join between two tables using a mix of - * equality and inequality conditions. The entire join condition is assumed to be a logical AND - * of the equality condition and inequality condition. - * NOTE: It is the responsibility of the caller to close the resulting size information object - * or native resources can be leaked! - * @param leftKeys the left table's key columns for the equality condition - * @param rightKeys the right table's key columns for the equality condition - * @param leftConditional the left table's columns needed to evaluate the inequality condition - * @param rightConditional the right table's columns needed to evaluate the inequality condition - * @param condition the inequality condition of the join - * @param nullEquality whether nulls should compare as equal - * @return size information for the join - */ - public static MixedJoinSize mixedLeftAntiJoinSize(Table leftKeys, Table rightKeys, - Table leftConditional, Table rightConditional, - CompiledExpression condition, - NullEquality nullEquality) { - long[] mixedSizeInfo = mixedLeftAntiJoinSize( - leftKeys.getNativeView(), rightKeys.getNativeView(), - leftConditional.getNativeView(), rightConditional.getNativeView(), - condition.getNativeHandle(), nullEquality == NullEquality.EQUAL); - assert mixedSizeInfo.length == 2; - long outputRowCount = mixedSizeInfo[0]; - long matchesColumnHandle = mixedSizeInfo[1]; - return new MixedJoinSize(outputRowCount, new ColumnVector(matchesColumnHandle)); - } - /** * Computes the gather map that can be used to manifest the result of a left anti join between * two tables using a mix of equality and inequality conditions. 
The entire join condition is @@ -3976,42 +3866,6 @@ public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKe return buildSingleJoinGatherMap(gatherMapData); } - /** - * Computes the gather map that can be used to manifest the result of a left anti join between - * two tables using a mix of equality and inequality conditions. The entire join condition is - * assumed to be a logical AND of the equality condition and inequality condition. - * A {@link GatherMap} instance will be returned that can be used to gather - * the left table to produce the result of the left anti join. - * - * It is the responsibility of the caller to close the resulting gather map instances. - * - * This interface allows passing the size result from - * {@link #mixedLeftAntiJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)} - * when the output size was computed previously. - * - * @param leftKeys the left table's key columns for the equality condition - * @param rightKeys the right table's key columns for the equality condition - * @param leftConditional the left table's columns needed to evaluate the inequality condition - * @param rightConditional the right table's columns needed to evaluate the inequality condition - * @param condition the inequality condition of the join - * @param nullEquality whether nulls should compare as equal - * @param joinSize mixed join size result - * @return left and right table gather maps - */ - public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKeys, - Table leftConditional, Table rightConditional, - CompiledExpression condition, - NullEquality nullEquality, - MixedJoinSize joinSize) { - long[] gatherMapData = mixedLeftAntiJoinGatherMapWithSize( - leftKeys.getNativeView(), rightKeys.getNativeView(), - leftConditional.getNativeView(), rightConditional.getNativeView(), - condition.getNativeHandle(), - nullEquality == NullEquality.EQUAL, - joinSize.getOutputRowCount(), 
joinSize.getMatches().getNativeView()); - return buildSingleJoinGatherMap(gatherMapData); - } - /** * Construct a table from a packed representation. * @param metadata host-based metadata for the table diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 51b8eb853de..e8616710217 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2838,20 +2838,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinGa }); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinSize( - JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, - jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { - return cudf::jni::mixed_join_size( - env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, - j_nulls_equal, - [](cudf::table_view const &left_keys, cudf::table_view const &right_keys, - cudf::table_view const &left_condition, cudf::table_view const &right_condition, - cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { - return cudf::mixed_left_semi_join_size(left_keys, right_keys, left_condition, - right_condition, condition, nulls_equal); - }); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { @@ -2866,22 +2852,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMa }); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMapWithSize( - JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, - jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal, jlong j_output_row_count, - jlong j_matches_view) { - auto size_info = cudf::jni::get_mixed_size_info(env, j_output_row_count, 
j_matches_view); - return cudf::jni::mixed_join_gather_single_map( - env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, - j_nulls_equal, - [&size_info](cudf::table_view const &left_keys, cudf::table_view const &right_keys, - cudf::table_view const &left_condition, cudf::table_view const &right_condition, - cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { - return cudf::mixed_left_semi_join(left_keys, right_keys, left_condition, right_condition, - condition, nulls_equal, size_info); - }); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) { return cudf::jni::join_gather_single_map( @@ -2930,20 +2900,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinGa }); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinSize( - JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, - jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { - return cudf::jni::mixed_join_size( - env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, - j_nulls_equal, - [](cudf::table_view const &left_keys, cudf::table_view const &right_keys, - cudf::table_view const &left_condition, cudf::table_view const &right_condition, - cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { - return cudf::mixed_left_anti_join_size(left_keys, right_keys, left_condition, - right_condition, condition, nulls_equal); - }); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { @@ -2958,22 +2914,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMa }); } -JNIEXPORT jlongArray JNICALL 
Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMapWithSize( - JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, - jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal, jlong j_output_row_count, - jlong j_matches_view) { - auto size_info = cudf::jni::get_mixed_size_info(env, j_output_row_count, j_matches_view); - return cudf::jni::mixed_join_gather_single_map( - env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, - j_nulls_equal, - [&size_info](cudf::table_view const &left_keys, cudf::table_view const &right_keys, - cudf::table_view const &left_condition, cudf::table_view const &right_condition, - cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { - return cudf::mixed_left_anti_join(left_keys, right_keys, left_condition, right_condition, - condition, nulls_equal, size_info); - }); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jclass, jlong left_table, jlong right_table) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 30905783c7f..8560a9caad7 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -3058,64 +3058,6 @@ void testMixedLeftSemiJoinGatherMapNulls() { } } - @Test - void testMixedLeftSemiJoinGatherMapWithSize() { - BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, - new ColumnReference(1, TableReference.LEFT), - new ColumnReference(1, TableReference.RIGHT)); - try (CompiledExpression condition = expr.compile(); - Table left = new Table.TestBuilder() - .column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 0) - .build(); - Table leftKeys = new Table(left.getColumn(0)); - Table right = new Table.TestBuilder() - .column(6, 5, 9, 8, 10, 32) - .column(0, 1, 2, 3, 4, 5) - .column(7, 8, 9, 0, 1, 2).build(); - Table rightKeys = new 
Table(right.getColumn(0)); - Table expected = new Table.TestBuilder() - .column(2, 7, 8) - .build(); - MixedJoinSize sizeInfo = Table.mixedLeftSemiJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.UNEQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); - try (GatherMap map = Table.mixedLeftSemiJoinGatherMap(leftKeys, rightKeys, left, right, - condition, NullEquality.UNEQUAL, sizeInfo)) { - verifySemiJoinGatherMap(map, expected); - } - } - } - - @Test - void testMixedLeftSemiJoinGatherMapNullsWithSize() { - BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, - new ColumnReference(1, TableReference.LEFT), - new ColumnReference(1, TableReference.RIGHT)); - try (CompiledExpression condition = expr.compile(); - Table left = new Table.TestBuilder() - .column(null, 3, 9, 0, 1, 7, 4, null, 5, 8) - .column( 1, 2, 3, 4, 5, 6, 7, 8, 9, 0) - .build(); - Table leftKeys = new Table(left.getColumn(0)); - Table right = new Table.TestBuilder() - .column(null, 5, null, 8, 10, 32) - .column( 0, 1, 2, 3, 4, 5) - .column( 7, 8, 9, 0, 1, 2).build(); - Table rightKeys = new Table(right.getColumn(0)); - Table expected = new Table.TestBuilder() - .column(0, 7, 8) - .build(); - MixedJoinSize sizeInfo = Table.mixedLeftSemiJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.EQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); - try (GatherMap map = Table.mixedLeftSemiJoinGatherMap(leftKeys, rightKeys, left, right, - condition, NullEquality.EQUAL, sizeInfo)) { - verifySemiJoinGatherMap(map, expected); - } - } - } - @Test void testMixedLeftAntiJoinGatherMap() { BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, @@ -3166,64 +3108,6 @@ void testMixedLeftAntiJoinGatherMapNulls() { } } - @Test - void testMixedLeftAntiJoinGatherMapWithSize() { - BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, - new ColumnReference(1, TableReference.LEFT), - new ColumnReference(1, 
TableReference.RIGHT)); - try (CompiledExpression condition = expr.compile(); - Table left = new Table.TestBuilder() - .column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 0) - .build(); - Table leftKeys = new Table(left.getColumn(0)); - Table right = new Table.TestBuilder() - .column(6, 5, 9, 8, 10, 32) - .column(0, 1, 2, 3, 4, 5) - .column(7, 8, 9, 0, 1, 2).build(); - Table rightKeys = new Table(right.getColumn(0)); - Table expected = new Table.TestBuilder() - .column(0, 1, 3, 4, 5, 6, 9) - .build(); - MixedJoinSize sizeInfo = Table.mixedLeftAntiJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.UNEQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); - try (GatherMap map = Table.mixedLeftAntiJoinGatherMap(leftKeys, rightKeys, left, right, - condition, NullEquality.UNEQUAL, sizeInfo)) { - verifySemiJoinGatherMap(map, expected); - } - } - } - - @Test - void testMixedLeftAntiJoinGatherMapNullsWithSize() { - BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, - new ColumnReference(1, TableReference.LEFT), - new ColumnReference(1, TableReference.RIGHT)); - try (CompiledExpression condition = expr.compile(); - Table left = new Table.TestBuilder() - .column(null, 3, 9, 0, 1, 7, 4, null, 5, 8) - .column( 1, 2, 3, 4, 5, 6, 7, 8, 9, 0) - .build(); - Table leftKeys = new Table(left.getColumn(0)); - Table right = new Table.TestBuilder() - .column(null, 5, null, 8, 10, 32) - .column( 0, 1, 2, 3, 4, 5) - .column( 7, 8, 9, 0, 1, 2).build(); - Table rightKeys = new Table(right.getColumn(0)); - Table expected = new Table.TestBuilder() - .column(1, 2, 3, 4, 5, 6, 9) - .build(); - MixedJoinSize sizeInfo = Table.mixedLeftAntiJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.EQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); - try (GatherMap map = Table.mixedLeftAntiJoinGatherMap(leftKeys, rightKeys, left, right, - condition, NullEquality.EQUAL, sizeInfo)) 
{ - verifySemiJoinGatherMap(map, expected); - } - } - } - @Test void testLeftSemiJoinGatherMap() { try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); From 61dbfe8dc7635264465ce46d7de9e87ca0353267 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 4 Apr 2024 15:22:48 -0400 Subject: [PATCH 026/272] Allow jit compilation when using a splayed CUDA toolkit (#15451) The `JitifyPreprocessKernels.cmake` module now handles when `CUDAToolkit_INCLUDE_DIRS` has multiple values correctly, allowing for compilation with splayed CUDA Toolkit installs. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15451 --- cpp/cmake/Modules/JitifyPreprocessKernels.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index 8c4e2b47fca..752c2028350 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -23,8 +23,9 @@ target_link_libraries(jitify_preprocess PUBLIC ${CMAKE_DL_LIBS}) function(jit_preprocess_files) cmake_parse_arguments(ARG "" "SOURCE_DIRECTORY" "FILES" ${ARGN}) - foreach(inc IN LISTS libcudacxx_raw_includes) - list(APPEND libcudacxx_includes "-I${inc}") + set(includes) + foreach(inc IN LISTS libcudacxx_raw_includes CUDAToolkit_INCLUDE_DIRS) + list(APPEND includes "-I${inc}") endforeach() foreach(ARG_FILE ${ARG_FILES}) set(ARG_OUTPUT ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files/${ARG_FILE}.jit.hpp) @@ -44,8 +45,7 @@ function(jit_preprocess_files) $ ${ARG_FILE} -o ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files -i -m -std=c++17 -remove-unused-globals -D_FILE_OFFSET_BITS=64 -D__CUDACC_RTC__ -I${CUDF_SOURCE_DIR}/include - 
-I${CUDF_SOURCE_DIR}/src ${libcudacxx_includes} -I${CUDAToolkit_INCLUDE_DIRS} - --no-preinclude-workarounds --no-replace-pragma-once + -I${CUDF_SOURCE_DIR}/src ${includes} --no-preinclude-workarounds --no-replace-pragma-once COMMENT "Custom command to JIT-compile files." ) endforeach() From c0f84bf5bbc7262015c42588fc1f4fd2b8e1b6c1 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 4 Apr 2024 15:24:04 -0400 Subject: [PATCH 027/272] Allow consumers of static builds to find nanoarrow (#15456) Allows consumers like spark-rapids to bring in libcudf static builds from the install and build trees. Authors: - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15456 --- cpp/cmake/thirdparty/get_nanoarrow.cmake | 1 + .../thirdparty/patches/nanoarrow_cmake.diff | 39 +++++++++++++++---- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 4316db99a8d..884e5a2f368 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -49,6 +49,7 @@ function(find_and_configure_nanoarrow) OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) + rapids_export_find_package_root(BUILD nanoarrow "${nanoarrow_BINARY_DIR}" EXPORT_SET cudf-exports) endfunction() find_and_configure_nanoarrow( diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff index b53e134ed2c..1262a38c0a4 100644 --- a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff +++ b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 8714c70..1feec13 100644 +index 
8714c70..6a9e505 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,6 @@ else() @@ -10,7 +10,15 @@ index 8714c70..1feec13 100644 # Avoids a warning about timestamps on downloaded files (prefer new policy # if available)) -@@ -111,6 +110,8 @@ if(NANOARROW_BUNDLE) +@@ -59,6 +58,7 @@ endif() + + configure_file(src/nanoarrow/nanoarrow_config.h.in generated/nanoarrow_config.h) + ++include(GNUInstallDirs) + if(NANOARROW_BUNDLE) + # Combine all headers into amalgamation/nanoarrow.h in the build directory + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/amalgamation) +@@ -111,6 +111,8 @@ if(NANOARROW_BUNDLE) if(NANOARROW_BUILD_TESTS) include_directories(${CMAKE_BINARY_DIR}/amalgamation) add_library(nanoarrow ${NANOARROW_C_TEMP}) @@ -19,7 +27,7 @@ index 8714c70..1feec13 100644 target_compile_definitions(nanoarrow PUBLIC "$<$:NANOARROW_DEBUG>") endif() -@@ -120,6 +121,7 @@ if(NANOARROW_BUNDLE) +@@ -120,10 +122,11 @@ if(NANOARROW_BUNDLE) else() add_library(nanoarrow src/nanoarrow/array.c src/nanoarrow/schema.c src/nanoarrow/array_stream.c src/nanoarrow/utils.c) @@ -27,25 +35,31 @@ index 8714c70..1feec13 100644 target_include_directories(nanoarrow PUBLIC $ -@@ -154,13 +156,50 @@ else() +- $) ++ $) + target_include_directories(nanoarrow + PUBLIC $ + ) +@@ -154,13 +157,49 @@ else() endif() endif() - install(TARGETS nanoarrow DESTINATION lib) + install(TARGETS nanoarrow -+ DESTINATION lib ++ DESTINATION "${CMAKE_INSTALL_LIBDIR}" + EXPORT nanoarrow-exports) install(DIRECTORY src/ - DESTINATION include +- DESTINATION include ++ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" FILES_MATCHING - PATTERN "*.h") + PATTERN "*.h*") install(FILES ${CMAKE_CURRENT_BINARY_DIR}/generated/nanoarrow_config.h - DESTINATION include/nanoarrow) +- DESTINATION include/nanoarrow) ++ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nanoarrow") + + # Generate package files for the build and install trees. 
+ include(CMakePackageConfigHelpers) -+ include(GNUInstallDirs) + + foreach(tree_type BUILD INSTALL) + if(tree_type STREQUAL "BUILD") @@ -80,6 +94,15 @@ index 8714c70..1feec13 100644 endif() # Always build integration test if building tests +@@ -171,7 +210,7 @@ if(NANOARROW_BUILD_TESTS OR NANOARROW_BUILD_INTEGRATION_TESTS) + src/nanoarrow/integration/c_data_integration.cc) + target_include_directories(nanoarrow_c_data_integration + PUBLIC $ +- $) ++ $) + target_link_libraries(nanoarrow_c_data_integration PRIVATE nanoarrow nlohmann_json) + endif() + @@ -215,34 +254,18 @@ if(NANOARROW_BUILD_TESTS) src/nanoarrow/integration/c_data_integration_test.cc) From 8509054861f57379524982cc70db294d85a0dc5c Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 4 Apr 2024 16:09:45 -0400 Subject: [PATCH 028/272] Remove deprecated hash() and spark_murmurhash3_x86_32() (#15375) Remove deprecated libcudf hash functions. The `cudf::hash()` and `cudf::hashing::spark_murmurhash3_x86_32()` were deprecated in previous releases. The `cudf::hash_partition()` function still relies on the enum `hash_id` so it has been moved from `hashing.cpp` to `partitioning.hpp`. Calls to `cudf::hashing::spark_murmurhash3_x86_32()` were also removed from the JNI code. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - https://github.com/nvdbaranec - Jason Lowe (https://github.com/jlowe) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15375 --- cpp/CMakeLists.txt | 2 - cpp/include/cudf/hashing.hpp | 52 -- cpp/include/cudf/hashing/detail/hashing.hpp | 5 - cpp/include/cudf/partitioning.hpp | 10 +- cpp/src/hash/hashing.cu | 53 -- cpp/src/hash/spark_murmurhash3_x86_32.cu | 442 -------------- .../hashing/spark_murmurhash3_x86_32_test.cpp | 576 ------------------ .../partitioning/hash_partition_test.cpp | 15 - .../java/ai/rapids/cudf/ColumnVector.java | 44 +- .../main/java/ai/rapids/cudf/HashType.java | 6 +- java/src/main/native/src/ColumnVectorJni.cpp | 10 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 219 ------- 12 files changed, 18 insertions(+), 1416 deletions(-) delete mode 100644 cpp/src/hash/hashing.cu delete mode 100644 cpp/src/hash/spark_murmurhash3_x86_32.cu delete mode 100644 cpp/tests/hashing/spark_murmurhash3_x86_32_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7c32474ea56..7d62e0acb10 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -346,7 +346,6 @@ add_library( src/groupby/sort/group_replace_nulls.cu src/groupby/sort/group_sum_scan.cu src/groupby/sort/sort_helper.cu - src/hash/hashing.cu src/hash/md5_hash.cu src/hash/murmurhash3_x86_32.cu src/hash/murmurhash3_x64_128.cu @@ -355,7 +354,6 @@ add_library( src/hash/sha256_hash.cu src/hash/sha384_hash.cu src/hash/sha512_hash.cu - src/hash/spark_murmurhash3_x86_32.cu src/hash/xxhash_64.cu src/interop/dlpack.cpp src/interop/from_arrow.cu diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 64a78da1803..83962b50a10 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -34,42 +34,11 @@ namespace cudf { */ using hash_value_type = uint32_t; -/** 
- * @brief Identifies the hash function to be used - * - */ -enum class hash_id { - HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed - HASH_MURMUR3, ///< Murmur3 hash function - HASH_SPARK_MURMUR3, ///< Spark Murmur3 hash function - HASH_MD5 ///< MD5 hash function -}; - /** * @brief The default seed value for hash functions */ static constexpr uint32_t DEFAULT_HASH_SEED = 0; -/** - * @brief Computes the hash value of each row in the input set of columns. - * - * @deprecated Since 23.08 - * - * @param input The table of columns to hash - * @param hash_function The hash function enum to use - * @param seed Optional seed value to use for the hash function - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @returns A column where each row is the hash of a column from the input - */ -[[deprecated]] std::unique_ptr hash( - table_view const& input, - hash_id hash_function = hash_id::HASH_MURMUR3, - uint32_t seed = DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - //! Hash APIs namespace hashing { @@ -112,27 +81,6 @@ std::unique_ptr murmurhash3_x64_128( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table - * - * @deprecated Since 24.04 - * - * This function computes the hash similar to MurmurHash3_x86_32 with special processing - * to match Spark's implementation results. 
- * - * @param input The table of columns to hash - * @param seed Optional seed value to use for the hash function - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @returns A column where each row is the hash of a row from the input - */ -[[deprecated]] std::unique_ptr spark_murmurhash3_x86_32( - table_view const& input, - uint32_t seed = DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Computes the MD5 hash value of each row in the given table * diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index eaeb5d6b068..88a43a64638 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -37,11 +37,6 @@ std::unique_ptr
murmurhash3_x64_128(table_view const& input, rmm::cuda_stream_view, rmm::mr::device_memory_resource* mr); -std::unique_ptr spark_murmurhash3_x86_32(table_view const& input, - uint32_t seed, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr); - std::unique_ptr md5(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index 2c91bdf64f5..7033aa500a2 100644 --- a/cpp/include/cudf/partitioning.hpp +++ b/cpp/include/cudf/partitioning.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,14 @@ namespace cudf { * @brief Column partitioning APIs */ +/** + * @brief Identifies the hash function to be used in hash partitioning + */ +enum class hash_id { + HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed + HASH_MURMUR3 ///< Murmur3 hash function +}; + /** * @brief Partitions rows of `t` according to the mapping specified by * `partition_map`. diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu deleted file mode 100644 index 68e02ef3cf4..00000000000 --- a/cpp/src/hash/hashing.cu +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include - -#include - -namespace cudf { -namespace hashing { -namespace detail { - -std::unique_ptr hash(table_view const& input, - hash_id hash_function, - uint32_t seed, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - switch (hash_function) { - case (hash_id::HASH_MURMUR3): return murmurhash3_x86_32(input, seed, stream, mr); - case (hash_id::HASH_SPARK_MURMUR3): return spark_murmurhash3_x86_32(input, seed, stream, mr); - case (hash_id::HASH_MD5): return md5(input, stream, mr); - default: CUDF_FAIL("Unsupported hash function."); - } -} - -} // namespace detail -} // namespace hashing - -std::unique_ptr hash(table_view const& input, - hash_id hash_function, - uint32_t seed, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return hashing::detail::hash(input, hash_function, seed, stream, mr); -} - -} // namespace cudf diff --git a/cpp/src/hash/spark_murmurhash3_x86_32.cu b/cpp/src/hash/spark_murmurhash3_x86_32.cu deleted file mode 100644 index c7992b4afa0..00000000000 --- a/cpp/src/hash/spark_murmurhash3_x86_32.cu +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -namespace cudf { -namespace hashing { -namespace detail { - -namespace { - -using spark_hash_value_type = int32_t; - -template ())> -struct Spark_MurmurHash3_x86_32 { - using result_type = spark_hash_value_type; - - constexpr Spark_MurmurHash3_x86_32() = default; - constexpr Spark_MurmurHash3_x86_32(uint32_t seed) : m_seed(seed) {} - - [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const - { - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; - } - - [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data, - cudf::size_type offset) const - { - // Read a 4-byte value from the data pointer as individual bytes for safe - // unaligned access (very likely for string types). - auto block = reinterpret_cast(data + offset); - return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); - } - - [[nodiscard]] result_type __device__ inline operator()(Key const& key) const - { - return compute(key); - } - - template - result_type __device__ inline compute(T const& key) const - { - return compute_bytes(reinterpret_cast(&key), sizeof(T)); - } - - result_type __device__ inline compute_remaining_bytes(std::byte const* data, - cudf::size_type len, - cudf::size_type tail_offset, - result_type h) const - { - // Process remaining bytes that do not fill a four-byte chunk using Spark's approach - // (does not conform to normal MurmurHash3). - for (auto i = tail_offset; i < len; i++) { - // We require a two-step cast to get the k1 value from the byte. First, - // we must cast to a signed int8_t. Then, the sign bit is preserved when - // casting to uint32_t under 2's complement. Java preserves the sign when - // casting byte-to-int, but C++ does not. 
- uint32_t k1 = static_cast(std::to_integer(data[i])); - k1 *= c1; - k1 = rotate_bits_left(k1, rot_c1); - k1 *= c2; - h ^= k1; - h = rotate_bits_left(static_cast(h), rot_c2); - h = h * 5 + c3; - } - return h; - } - - result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const - { - constexpr cudf::size_type BLOCK_SIZE = 4; - cudf::size_type const nblocks = len / BLOCK_SIZE; - cudf::size_type const tail_offset = nblocks * BLOCK_SIZE; - result_type h = m_seed; - - // Process all four-byte chunks. - for (cudf::size_type i = 0; i < nblocks; i++) { - uint32_t k1 = getblock32(data, i * BLOCK_SIZE); - k1 *= c1; - k1 = rotate_bits_left(k1, rot_c1); - k1 *= c2; - h ^= k1; - h = rotate_bits_left(static_cast(h), rot_c2); - h = h * 5 + c3; - } - - h = compute_remaining_bytes(data, len, tail_offset, h); - - // Finalize hash. - h ^= len; - h = fmix32(h); - return h; - } - - private: - uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; - static constexpr uint32_t c1 = 0xcc9e2d51; - static constexpr uint32_t c2 = 0x1b873593; - static constexpr uint32_t c3 = 0xe6546b64; - static constexpr uint32_t rot_c1 = 15; - static constexpr uint32_t rot_c2 = 13; -}; - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - bool const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - int8_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - uint8_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - int16_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - uint16_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline 
Spark_MurmurHash3_x86_32::operator()( - float const& key) const -{ - return compute(normalize_nans(key)); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - double const& key) const -{ - return compute(normalize_nans(key)); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - cudf::string_view const& key) const -{ - auto const data = reinterpret_cast(key.data()); - auto const len = key.size_bytes(); - return compute_bytes(data, len); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - numeric::decimal32 const& key) const -{ - return compute(key.value()); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - numeric::decimal64 const& key) const -{ - return compute(key.value()); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - numeric::decimal128 const& key) const -{ - // Generates the Spark MurmurHash3 hash value, mimicking the conversion: - // java.math.BigDecimal.valueOf(unscaled_value, _scale).unscaledValue().toByteArray() - // https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L381 - __int128_t const val = key.value(); - constexpr cudf::size_type key_size = sizeof(__int128_t); - std::byte const* data = reinterpret_cast(&val); - - // Small negative values start with 0xff..., small positive values start with 0x00... - bool const is_negative = val < 0; - std::byte const zero_value = is_negative ? std::byte{0xff} : std::byte{0x00}; - - // If the value can be represented with a shorter than 16-byte integer, the - // leading bytes of the little-endian value are truncated and are not hashed. 
- auto const reverse_begin = thrust::reverse_iterator(data + key_size); - auto const reverse_end = thrust::reverse_iterator(data); - auto const first_nonzero_byte = - thrust::find_if_not(thrust::seq, reverse_begin, reverse_end, [zero_value](std::byte const& v) { - return v == zero_value; - }).base(); - // Max handles special case of 0 and -1 which would shorten to 0 length otherwise - cudf::size_type length = - std::max(1, static_cast(thrust::distance(data, first_nonzero_byte))); - - // Preserve the 2's complement sign bit by adding a byte back on if necessary. - // e.g. 0x0000ff would shorten to 0x00ff. The 0x00 byte is retained to - // preserve the sign bit, rather than leaving an "f" at the front which would - // change the sign bit. However, 0x00007f would shorten to 0x7f. No extra byte - // is needed because the leftmost bit matches the sign bit. Similarly for - // negative values: 0xffff00 --> 0xff00 and 0xffff80 --> 0x80. - if ((length < key_size) && (is_negative ^ bool(data[length - 1] & std::byte{0x80}))) { ++length; } - - // Convert to big endian by reversing the range of nonzero bytes. Only those bytes are hashed. - __int128_t big_endian_value = 0; - auto big_endian_data = reinterpret_cast(&big_endian_value); - thrust::reverse_copy(thrust::seq, data, data + length, big_endian_data); - return compute_bytes(big_endian_data, length); -} - -/** - * @brief Computes the hash value of a row in the given table. - * - * This functor uses Spark conventions for Murmur hashing, which differs from - * the Murmur implementation used in the rest of libcudf. These differences - * include: - * - Serially using the output hash as an input seed for the next item - * - Ignorance of null values - * - * The serial use of hashes as seeds means that data of different nested types - * can exhibit hash collisions. 
For example, a row of an integer column - * containing a 1 will have the same hash as a lists column of integers - * containing a list of [1] and a struct column of a single integer column - * containing a struct of {1}. - * - * As a consequence of ignoring null values, inputs like [1], [1, null], and - * [null, 1] have the same hash (an expected hash collision). This kind of - * collision can also occur across a table of nullable columns and with nulls - * in structs ({1, null} and {null, 1} have the same hash). The seed value (the - * previous element's hash value) is returned as the hash if an element is - * null. - * - * For additional differences such as special tail processing and decimal type - * handling, refer to the Spark_MurmurHash3_x86_32 functor. - * - * @tparam hash_function Hash functor to use for hashing elements. Must be Spark_MurmurHash3_x86_32. - * @tparam Nullate A cudf::nullate type describing whether to check for nulls. - */ -template